diff --git a/CMakeLists.txt b/CMakeLists.txt index d43c1f825..4477cf212 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -903,11 +903,12 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) choose the SYCL implementation to be used in the SYCL backend: ${PLSSVM_SYCL_BACKEND_NAME_LIST} (default: automatic) " ) + string(REPLACE ";" "|" PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST}") set(PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY " .TP .B --sycl_kernel_invocation_type -choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic) +choose the kernel invocation type when using SYCL as backend: ${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST} (default: automatic) " ) endif () @@ -936,8 +937,6 @@ endif () # configure the manpage files configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-train.1.in ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-train.1 @ONLY) -# update manpage entry since plssvm-predict can't recognize the SYCL kernel invocation type -set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-predict.1.in ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-predict.1 @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-scale.1.in ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-scale.1 @ONLY) diff --git a/README.md b/README.md index 797ba4133..5fb8ebe31 100644 --- a/README.md +++ b/README.md @@ -346,6 +346,8 @@ If the SYCL backend is available, additional options can be set. - `AUTO`: check for DPC++/icpx as implementation for the SYCL backend but **do not** fail if not available - `OFF`: do not check for DPC++/icpx as implementation for the SYCL backend +- `PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS` (default: `ON`): enable SYCL's `hierarchical` and AdaptiveCpp's `scoped` kernel invocation types + To use DPC++/icpx for SYCL, simply set the `CMAKE_CXX_COMPILER` to the respective DPC++/icpx clang executable during CMake invocation. If the SYCL implementation is DPC++/icpx the following additional options are available: @@ -684,7 +686,7 @@ Usage: -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) --sycl_kernel_invocation_type arg - choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic) + choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg @@ -745,7 +747,7 @@ The `--target_platform=automatic` option works for the different backends as fol - `stdpar`: target device must be selected at compile time (using `PLSSVM_TARGET_PLATFORMS`) or using environment variables at runtime The `--sycl_kernel_invocation_type` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. -If the `--sycl_kernel_invocation_type` is `automatic`, the `nd_range` invocation type is currently always used. +If the `--sycl_kernel_invocation_type` is `automatic`, the `work_group` invocation type is currently always used. 
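For reference, a minimal C++ sketch of the documented `automatic` resolution, assuming only the `plssvm::sycl::kernel_invocation_type` enumerators introduced in this change set; the helper name `resolve_invocation_type` is hypothetical and not part of the PLSSVM API:

```cpp
#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type

// hypothetical helper mirroring the documented behavior: `automatic` currently always
// resolves to the `work_group` invocation type, every other value is kept unchanged
[[nodiscard]] constexpr plssvm::sycl::kernel_invocation_type resolve_invocation_type(const plssvm::sycl::kernel_invocation_type invocation) noexcept {
    return invocation == plssvm::sycl::kernel_invocation_type::automatic
               ? plssvm::sycl::kernel_invocation_type::work_group
               : invocation;
}
```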
If the `--sycl_implementation_type` is `automatic`, the used SYCL implementation is determined by the `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` CMake flag. If the `--kokkos_execution_space` is `automatic`, uses the best fitting execution space based on the provided and/or available target platforms. @@ -793,6 +795,8 @@ Usage: -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) + --sycl_kernel_invocation_type arg + choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg diff --git a/bindings/Python/README.md b/bindings/Python/README.md index 196dbfc7b..504d2533b 100644 --- a/bindings/Python/README.md +++ b/bindings/Python/README.md @@ -332,10 +332,10 @@ The following table lists all PLSSVM enumerations exposed on the Python side: If a SYCL implementation is available, additional enumerations are available: -| enumeration | values | description | -|------------------------|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | -| `KernelInvocationType` | `AUTOMATIC`, `ND_RANGE` | The different supported SYCL kernel invocation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `ND_RANGE` (only implemented to be able to add new invocation types in the future). | +| enumeration | values | description | +|------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | +| `KernelInvocationType` | `AUTOMATIC`, `BASIC`, `WORK_GROUP`, `HIERARCHICAL`, `SCOPED` | The different supported SYCL kernel invocation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `WORK_GROUP`. 
| If the stdpar backend is available, an additional enumeration is available: diff --git a/bindings/Python/backends/sycl.cpp b/bindings/Python/backends/sycl.cpp index f2cc924d6..98c27214b 100644 --- a/bindings/Python/backends/sycl.cpp +++ b/bindings/Python/backends/sycl.cpp @@ -48,7 +48,10 @@ void init_sycl(py::module_ &m, const py::exception &base_exce py::enum_ py_enum_invocation(sycl_module, "KernelInvocationType", "Enum class for all possible SYCL kernel invocation types supported in PLSSVM."); py_enum_invocation .value("AUTOMATIC", plssvm::sycl::kernel_invocation_type::automatic, "use the best kernel invocation type for the current SYCL implementation and target hardware platform") - .value("ND_RANGE", plssvm::sycl::kernel_invocation_type::nd_range, "use the nd_range kernel invocation type"); + .value("BASIC", plssvm::sycl::kernel_invocation_type::basic, "use the basic data parallel kernel invocation type") + .value("WORK_GROUP", plssvm::sycl::kernel_invocation_type::work_group, "use the work-group data parallel kernel invocation type") + .value("HIERARCHICAL", plssvm::sycl::kernel_invocation_type::hierarchical, "use the hierarchical data parallel kernel invocation type") + .value("SCOPED", plssvm::sycl::kernel_invocation_type::scoped, "use the AdaptiveCpp specific scoped parallelism kernel invocation type"); // enable implicit conversion from string to enum plssvm::bindings::python::util::register_implicit_str_enum_conversion(py_enum_invocation); diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox index dcb4337d0..412fe72b1 100644 --- a/docs/resources/dirs.dox +++ b/docs/resources/dirs.dox @@ -18,7 +18,7 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Directory containing the implementation of all four available backends: OpenMP, CUDA, OpenCL, and SYCL. + * @brief Directory containing the implementation of all available backends. */ /** @@ -488,6 +488,50 @@ * @brief Directory containing kernel implementations for the explicit CG algorithm using the SYCL backend. */ +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/basic + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing basic data parallel kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing hierarchical kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/scoped + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. 
+ * + * @brief Directory containing scoped-parallelism kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/work_group + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing work-group data parallel kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + /** * @dir include/plssvm/backends/SYCL/kernel/cg_implicit * @author Alexander Van Craen @@ -499,6 +543,105 @@ * @brief Directory containing kernel implementations for the implicit CG algorithm using the SYCL backend. */ +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/basic + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing basic data parallel kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing hierarchical kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/scoped + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing scoped-parallelism kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/work_group + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing work-group data parallel kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing kernel implementations for the predictions using the SYCL backend. 
+ */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/basic + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing basic data parallel kernel implementations for the predictions using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/hierarchical + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing hierarchical kernel implementations for the predictions using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/scoped + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing scoped-parallelism kernel implementations for the predictions using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/work_group + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing work-group data parallel kernel implementations for the predictions using the SYCL backend. + */ + /** * @dir include/plssvm/backends/SYCL/DPCPP * @author Alexander Van Craen diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp index 4fd639732..55b6a746b 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp @@ -23,6 +23,7 @@ #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES, plssvm::detail::is_one_type_of +#include "plssvm/exceptions/exceptions.hpp" // plssvm::invalid_parameter_exception #include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator #include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::{has_only_sycl_parameter_named_args_v, has_only_sycl_named_args_v} #include "plssvm/svm/csvc.hpp" // plssvm::csvc @@ -76,6 +77,14 @@ class csvm : public ::plssvm::detail::gpu_csvm(parser, sycl_kernel_invocation_type); + +#if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the AdaptiveCpp SYCL backend!" }; + } else if (invocation_type_ == sycl::kernel_invocation_type::scoped) { + throw ::plssvm::invalid_parameter_exception{ "he provided sycl::kernel_invocation_type::scoped is disabled for the AdaptiveCpp SYCL backend!" 
}; + } +#endif } this->init(target); } diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp index bb4ff90a6..23ffb1872 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp @@ -15,9 +15,11 @@ #include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue (PImpl) +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/detail/utility.hpp" // plssvm::detail::unreachable #include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include // std::string #include // std::pair @@ -46,6 +48,30 @@ template } } +/** + * @brief Convert the provided @p grid and @p block to the final SYCL execution range. + * @tparam invocation_type the SYCL kernel invocation type + * @param[in] grid the execution grid + * @param[in] block the execution block + * @return the SYCL native execution range + */ +template +auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { + const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); + const ::sycl::range native_block = detail::dim_type_to_native<2>(block); + + if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + return ::sycl::range<2>{ native_grid * native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical || invocation_type == sycl::kernel_invocation_type::scoped) { + return ::sycl::nd_range<2>{ native_grid, native_block }; + } else { + // can't be reached + ::plssvm::detail::unreachable(); + } +} + /** * @brief Returns the list devices matching the target platform @p target and the actually used target platform * (only interesting if the provided @p target was automatic). diff --git a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp index 4bcdc2da9..4b1a6b570 100644 --- a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp +++ b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp @@ -23,6 +23,7 @@ #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES, plssvm::detail::is_one_type_of +#include "plssvm/exceptions/exceptions.hpp" // plssvm::invalid_parameter_exception #include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator #include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::{has_only_sycl_parameter_named_args_v, has_only_sycl_named_args_v} #include "plssvm/svm/csvc.hpp" // plssvm::csvc @@ -63,6 +64,7 @@ class csvm : public ::plssvm::detail::gpu_csvm(parser, sycl_kernel_invocation_type); + // the invocation type "scoped" isn't supported by DPC++ + if (invocation_type_ == sycl::kernel_invocation_type::scoped) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::scoped isn't supported by DPC++!" 
}; + } + +#if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the DPC++ SYCL backend!" }; + } +#endif } this->init(target); } diff --git a/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp b/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp index 327cf1ac7..d61a73407 100644 --- a/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp +++ b/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp @@ -13,11 +13,13 @@ #define PLSSVM_BACKENDS_SYCL_DPCPP_DETAIL_UTILITY_HPP_ #pragma once -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type -#include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::dpcpp::detail::queue (PImpl) -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::dpcpp::detail::queue (PImpl) +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/detail/utility.hpp" // plssvm::detail::unreachable +#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include // std::size_t #include // std::string @@ -47,6 +49,30 @@ template } } +/** + * @brief Convert the provided @p grid and @p block to the final SYCL execution range. + * @tparam invocation_type the SYCL kernel invocation type + * @param[in] grid the execution grid + * @param[in] block the execution block + * @return the SYCL native execution range + */ +template +auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { + const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); + const ::sycl::range native_block = detail::dim_type_to_native<2>(block); + + if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + return ::sycl::range<2>{ native_grid * native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical) { + return ::sycl::nd_range<2>{ native_grid, native_block }; + } else { + // can't be reached + ::plssvm::detail::unreachable(); + } +} + /** * @brief Returns the list devices matching the target platform @p target and the actually used target platform * (only interesting if the provided @p target was automatic). diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp new file mode 100644 index 000000000..2e528149c --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -0,0 +1,330 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and the basic data parallel kernels. 
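To make the `get_execution_range` mapping introduced above concrete, here is a minimal sketch using plain SYCL types; the 4x4 grid of 16x16 work-items is an arbitrary example, not a PLSSVM default:

```cpp
#include <sycl/sycl.hpp>

int main() {
    // illustrative sizes only: 4x4 work-groups ("grid") of 16x16 work-items each ("block")
    const sycl::range<2> grid{ 4, 4 };
    const sycl::range<2> block{ 16, 16 };

    // basic: one flat range covering all work-items, launched via parallel_for(range, ...)
    const sycl::range<2> basic_range{ grid * block };                 // -> { 64, 64 }

    // work_group: an nd_range with explicit global and local sizes
    const sycl::nd_range<2> work_group_range{ grid * block, block };  // -> global { 64, 64 }, local { 16, 16 }

    // hierarchical (and AdaptiveCpp's scoped): group count and group size are kept separate,
    // transported in an nd_range exactly as in the functions above
    const sycl::nd_range<2> hierarchical_range{ grid, block };        // -> { 4, 4 } groups of { 16, 16 } work-items

    return 0;
}
```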
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "sycl/sycl.hpp" // sycl::item + +#include // std::size_t + +namespace plssvm::sycl::detail::basic { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { + // perform the dot product calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim < global_j) { + A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + + // apply the (partial) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * Uses SYCL's basic data parallel kernels. + */ +class device_kernel_symm_mirror { + public: + /** + * @brief Initialize the SYCL kernel function object. 
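The accesses to `A_` in `device_kernel_symm` above follow the packed, padded upper-triangular storage of the symmetric matrix. As a reading aid, a small hedged helper restating that index computation; `packed_upper_index` and its parameter names are illustrative and not part of PLSSVM:

```cpp
#include <cstddef>  // std::size_t

// Illustrative helper (not PLSSVM code): linear index of element (i, j) of a symmetric matrix stored
// as a row-wise packed upper triangle with logical size n and per-row padding pad. Row r holds the
// columns r..n-1 plus pad padding entries, so the rows before it occupy r * (n + pad) - r * (r - 1) / 2
// elements; adding the in-row offset (col - row) yields the expression used in the kernel above.
[[nodiscard]] constexpr std::size_t packed_upper_index(const std::size_t i, const std::size_t j, const std::size_t n, const std::size_t pad) noexcept {
    const std::size_t row = i < j ? i : j;  // the kernel mirrors accesses below the diagonal
    const std::size_t col = i < j ? j : i;
    return row * (n + pad) + col - row * (row + 1) / 2;
}
```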
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over the remaining features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + + // apply the (remaining) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto partial_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds 
accesses + if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_inplace_matrix_add { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type *rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_inplace_matrix_scale { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::basic + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..65587ddaa --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -0,0 +1,139 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and the basic data parallel kernels. 
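Because the basic kernels above are plain function objects invoked with a `sycl::item<2>`, they are dispatched over a flat `sycl::range` (the `basic` branch of `get_execution_range` earlier in this patch). The following hedged, self-contained sketch shows that dispatch pattern; the `scale_kernel` functor and all sizes are stand-ins, not PLSSVM code:

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>  // std::size_t

// stand-in for a basic data parallel functor in the style of device_kernel_inplace_matrix_scale
class scale_kernel {
  public:
    scale_kernel(double *data, const double scale) :
        data_{ data },
        scale_{ scale } { }

    // one work-item per matrix entry, addressed through the flat sycl::item
    void operator()(::sycl::item<2> idx) const {
        data_[idx.get_id(0) * idx.get_range(1) + idx.get_id(1)] *= scale_;
    }

  private:
    double *data_;
    const double scale_;
};

int main() {
    ::sycl::queue q{};

    constexpr std::size_t n = 64;
    double *data = ::sycl::malloc_device<double>(n * n, q);
    q.fill(data, 1.0, n * n).wait();

    // basic invocation: a flat range (grid * block collapsed into a single range), no explicit work-group size
    q.parallel_for(::sycl::range<2>{ n, n }, scale_kernel{ data, 2.0 }).wait();

    ::sycl::free(data, q);
    return 0;
}
```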
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::basic { + +/** + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @details Uses SYCL's basic data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_assembly { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] kernel_matrix_d the calculated kernel matrix + * @param[in] data_d the data points to calculate the kernel matrix from + * @param[in] num_rows the number of data points + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] q the vector used in the dimensional reduction + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_d_{ kernel_matrix_d }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + q_{ q }, + QA_cost_{ QA_cost }, + cost_{ cost }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + } + + /** + * @brief Function call operator overload performing the actual calculation. 
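As a reading aid for the assembly performed by the `operator()` that follows: each stored kernel matrix entry combines the kernel function value with the dimensional reduction terms `QA_cost`, `q[i]`, and `q[j]`, plus the `cost` term on the diagonal. A hedged host-side reference, assuming the linear kernel function (the function name and containers are illustrative, not PLSSVM code):

```cpp
#include <cstddef>  // std::size_t
#include <vector>   // std::vector

// illustrative host-side reference (not PLSSVM code) for one assembled entry with i >= j
double assembled_entry(const std::vector<std::vector<double>> &data, const std::vector<double> &q,
                       const double QA_cost, const double cost, const std::size_t i, const std::size_t j) {
    // linear kernel: the accumulated feature reduction is already the kernel value k(x_i, x_j)
    double k_ij = 0.0;
    for (std::size_t d = 0; d < data[i].size(); ++d) {
        k_ij += data[i][d] * data[j][d];
    }
    // apply the dimensional reduction terms
    double value = k_ij + QA_cost - q[i] - q[j];
    // the cost factor is only added on the diagonal
    if (i == j) {
        value += cost;
    }
    return value;
}
```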
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + if (i >= j) { + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + } + } + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *kernel_matrix_d_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type *q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::basic + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp new file mode 100644 index 000000000..de6358ec8 --- /dev/null +++ 
b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -0,0 +1,472 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and the hierarchical data parallel kernels. + */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item + +#include // std::size_t + +namespace plssvm::sycl::detail::hierarchical { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // determine on which side of the diagonal we are located + if (dim + threadIdx_x < global_j) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + // 
determine on which side of the diagonal we are located + if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // implicit barrier + + // perform the dot product calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + + // implicit barrier + } + + // apply the (partial) BLAS operation and update C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_symm_mirror { + public: + /** + * @brief Initialize the SYCL kernel function object. 
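The hierarchical kernels above receive a whole `sycl::group<2>` and spawn their work-items explicitly via `parallel_for_work_item`, so they are launched with `parallel_for_work_group`, passing the group count and the group size separately (the `hierarchical` branch of `get_execution_range`). Below is a hedged, self-contained sketch of that launch pattern; the `add_one_kernel` functor and the sizes are illustrative assumptions, not PLSSVM code:

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>  // std::size_t

// stand-in for a hierarchical functor in the style of the kernels above
class add_one_kernel {
  public:
    explicit add_one_kernel(double *data) :
        data_{ data } { }

    // invoked once per work-group; the work-items are spawned explicitly inside
    void operator()(::sycl::group<2> group) const {
        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
            const std::size_t row = idx.get_global_id(0);
            const std::size_t col = idx.get_global_id(1);
            data_[row * idx.get_global_range(1) + col] += 1.0;
        });
        // consecutive parallel_for_work_item scopes are separated by an implicit work-group barrier
    }

  private:
    double *data_;
};

int main() {
    ::sycl::queue q{};

    constexpr std::size_t n = 64;
    double *data = ::sycl::malloc_device<double>(n * n, q);
    q.fill(data, 0.0, n * n).wait();

    // hierarchical invocation: the number of work-groups and the work-group size are passed separately
    q.submit([&](::sycl::handler &cgh) {
         cgh.parallel_for_work_group(::sycl::range<2>{ n / 16, n / 16 }, ::sycl::range<2>{ 16, 16 }, add_one_kernel{ data });
     }).wait();

    ::sycl::free(data, q);
    return 0;
}
```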
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices and diagonal condition + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // iterate over the remaining features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // implicit barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + + // implicit barrier + } + + // apply the (remaining) BLAS operation and update C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto partial_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && 
partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_inplace_matrix_add { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(0); + const std::size_t threadIdx_y = idx.get_local_id(1); + const std::size_t blockDim_x = idx.get_local_range(0); + const std::size_t blockDim_y = idx.get_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type *rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_inplace_matrix_scale { + public: + /** + * @brief Initialize the SYCL kernel function object. 
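+     * @details In contrast to the BLAS kernels above, no local memory and no `private_memory` helpers are needed: the whole operator() body is a single
+     *          `parallel_for_work_item` phase in which every work-item updates its own `INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE` tile of @p lhs.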
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(0); + const std::size_t threadIdx_y = idx.get_local_id(1); + const std::size_t blockDim_x = idx.get_local_range(0); + const std::size_t blockDim_y = idx.get_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..b09fef0f8 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -0,0 +1,203 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and the hierarchical data parallel kernels. 
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                               // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                   // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp"  // sycl::group, sycl::private_memory, sycl::h_item
+
+#include <cstddef>  // std::size_t
+#include <tuple>    // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::hierarchical {
+
+/**
+ * @brief Create the explicit kernel matrix using the @p kernel_function.
+ * @details Uses SYCL's hierarchical data parallel kernels.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple`
+ */
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_assembly {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+     * @param[out] kernel_matrix_d the calculated kernel matrix
+     * @param[in] data_d the data points to calculate the kernel matrix from
+     * @param[in] num_rows the number of data points
+     * @param[in] device_num_rows the number of rows the current device is responsible for
+     * @param[in] row_offset the first row in @p data_d the current device is responsible for
+     * @param[in] num_features the number of features per data point
+     * @param[in] q the vector used in the dimensional reduction
+     * @param[in] QA_cost the scalar used in the dimensional reduction
+     * @param[in] cost the cost factor the diagonal is scaled with
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
+     */
+    device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) :
+        kernel_matrix_d_{ kernel_matrix_d },
+        data_d_{ data_d },
+        num_rows_{ num_rows },
+        device_num_rows_{ device_num_rows },
+        row_offset_{ row_offset },
+        num_features_{ num_features },
+        q_{ q },
+        QA_cost_{ QA_cost },
+        cost_{ cost },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        kernel_function_parameter_{ std::make_tuple(std::forward<Args>(kernel_function_parameter)...) } {
+    }
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
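+     * @details The body is split into several `parallel_for_work_item` phases; the implicit work-group barrier between consecutive phases replaces the
+     *          explicit barrier calls of the `work_group` (nd_range) variant. A hedged construction sketch inside a command group (the handler `cgh`, the
+     *          `rbf` template argument, and the trailing `gamma` parameter are merely illustrative; the real values are supplied by the SYCL backend):
+     * @code{.cpp}
+     * cgh.parallel_for_work_group(::sycl::range<2>{ grid_y, grid_x },
+     *                             ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                             device_kernel_assembly<kernel_function_type::rbf, real_type>{ kernel_matrix_d, data_d, num_rows, device_num_rows, row_offset, num_features, q, QA_cost, cost, grid_x_offset, grid_y_offset, gamma });
+     * @endcode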
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // exploit symmetry + if (group[1] >= group[0]) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; 
+ data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // implicit group barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + + // implicit barrier + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp(idx)[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + }); + } + } + + private: + /// @cond Doxygen_suppress + real_type *kernel_matrix_d_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type *q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp new file mode 100644 index 000000000..9e8500d73 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -0,0 +1,457 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project 
which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and AdaptiveCpp's scoped parallelism. + */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
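+     * @details Hypothetical launch sketch, assuming AdaptiveCpp's scoped-parallelism entry point `handler::parallel(num_groups, group_size, functor)`; the
+     *          exact launch call as well as the `q`, `grid_x`, and `grid_y` names are assumptions and not defined in this header:
+     * @code{.cpp}
+     * q.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel(::sycl::range<2>{ grid_y, grid_x },
+     *                  ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                  device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A, B, beta, C, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode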
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // determine on which side of the diagonal we are located + if (dim + threadIdx_x < global_j) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + // determine on which side of the diagonal we are located + if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { + A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + 
THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // perform calculations + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + } + + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_symm_mirror { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += 
static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; + A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // perform calculations + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + } + + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto partial_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const 
std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_inplace_matrix_add { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + [&]() { + // scale + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + const std::size_t threadIdx_y = idx.get_local_id(group, 1); + const std::size_t blockDim_x = group.get_logical_local_range(0); + const std::size_t blockDim_y = group.get_logical_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type *rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_inplace_matrix_scale { + public: + /** + * @brief Initialize the SYCL kernel function object. 
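+     * @details The `memory_environment` call below requests no local or private allocations; it merely provides the scope in which a single
+     *          `distribute_items_and_wait` performs the element-wise `lhs += rhs` update.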
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + [&]() { + // scale + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + const std::size_t threadIdx_y = idx.get_local_id(group, 1); + const std::size_t blockDim_x = group.get_logical_local_range(0); + const std::size_t blockDim_y = group.get_logical_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::scoped + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..4ed3764ce --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -0,0 +1,189 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and AdaptiveCpp's scoped parallelism. 
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_assembly { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] kernel_matrix_d the calculated kernel matrix + * @param[in] data_d the data points to calculate the kernel matrix from + * @param[in] num_rows the number of data points + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] q the vector used in the dimensional reduction + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_d_{ kernel_matrix_d }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + q_{ q }, + QA_cost_{ QA_cost }, + cost_{ cost }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + } + + /** + * @brief Function call operator overload performing the actual calculation. 
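+     * @details `require_local_mem` provides the two feature caches, `require_private_mem` the per-work-item indices and the `temp` accumulator. Each
+     *          `distribute_items_and_wait` call forms one barrier-separated phase, mirroring the implicit barriers of the hierarchical variant, while the
+     *          `group[1] >= group[0]` check exploits the symmetry of the kernel matrix by computing only one triangular part.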
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // exploit symmetry + if (group[1] >= group[0]) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // 
perform calculations + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + } + + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp(idx)[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + }); + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *kernel_matrix_d_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type *q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::scoped + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp similarity index 94% rename from include/plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp rename to include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index c5cfca67f..ae07f7ec6 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -6,23 +6,24 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend. 
+ * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and the work-group data parallel kernels. */ -#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BLAS_HPP_ -#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BLAS_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} -#include "sycl/sycl.hpp" // sycl::nd_item +#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor #include // std::size_t -namespace plssvm::sycl::detail { +namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_symm { public: @@ -87,20 +88,20 @@ class device_kernel_symm { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // determine on which side of the diagonal we are located - if (dim + nd_idx.get_local_id(0) < global_j) { + if (dim + threadIdx_x < global_j) { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } // determine on which side of the diagonal we are located - if (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE < global_j) { + if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; @@ -161,6 +162,7 @@ class device_kernel_symm { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! 
+ * Uses SYCL's work-group data parallel kernels. */ class device_kernel_symm_mirror { public: @@ -234,8 +236,8 @@ class device_kernel_symm_mirror { const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + nd_idx.get_local_id(0)) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + nd_idx.get_local_id(0)) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + nd_idx.get_local_id(0)) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE_uz) + global_j]; + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; @@ -292,6 +294,7 @@ class device_kernel_symm_mirror { /** * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_inplace_matrix_add { public: @@ -351,6 +354,7 @@ class device_kernel_inplace_matrix_add { /** * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_inplace_matrix_scale { public: @@ -408,6 +412,6 @@ class device_kernel_inplace_matrix_scale { /// @endcond }; -} // namespace plssvm::sycl::detail +} // namespace plssvm::sycl::detail::work_group -#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BLAS_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp similarity index 95% rename from include/plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp rename to include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index a00fa2d4a..96030fbe7 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -6,26 +6,27 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. 
* See the LICENSE.md file in the project root for full license information. * - * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend. + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and the work-group data parallel kernels. */ -#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ -#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "sycl/sycl.hpp" // sycl::nd_item +#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor #include // std::size_t #include // std::tuple, std::make_tuple -namespace plssvm::sycl::detail { +namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. + * @details Uses SYCL's work-group data parallel kernels. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ @@ -171,6 +172,6 @@ class device_kernel_assembly { /// @endcond }; -} // namespace plssvm::sycl::detail +} // namespace plssvm::sycl::detail::work_group -#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp new file mode 100644 index 000000000..7b517a7b1 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -0,0 +1,159 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and the basic data parallel kernels. 
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::basic { + +/** + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @details Uses SYCL's basic data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function + */ +template +class device_kernel_assembly_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
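+     * @details A minimal launch sketch, assuming a `::sycl::queue` named `queue`, suitable grid ranges `grid_range_y`/`grid_range_x`,
+     *          and already allocated device pointers; the linear kernel function is used here because it needs no additional
+     *          parameters:
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for(::sycl::range<2>{ grid_range_y, grid_range_x },
+     *                      device_kernel_assembly_symm<kernel_function_type::linear>{ alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode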
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // only calculate the upper triangular matrix + if (i >= j) { + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + } + } + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto device_global_i = i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto device_global_j = j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + // calculate the values of alpha * A * B + for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + } + } else { + // calculate the values of alpha * A * B + for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // symmetry + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + } + } + } + } + } + } + } + + private: + /// @cond Doxygen_suppress + const real_type alpha_; + const real_type *q_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + 
    const std::size_t num_features_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const real_type *B_;
+    real_type *C_;
+    const std::size_t num_classes_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::tuple<Args...> kernel_function_parameter_;
+    /// @endcond
+};
+
+}  // namespace plssvm::sycl::detail::basic
+
+#endif  // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp
new file mode 100644
index 000000000..d2f7b0a5c
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp
@@ -0,0 +1,366 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and the hierarchical data parallel kernels.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp"  // sycl::group, sycl::private_memory, sycl::h_item
+
+#include <cstddef>  // std::size_t
+#include <tuple>    // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::hierarchical {
+
+/**
+ * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
+ * @details Uses SYCL's hierarchical data parallel kernels.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function
+ */
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_assembly_symm {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+ * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
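+     * @details A minimal launch sketch, assuming a `::sycl::queue` named `queue`, the number of work-groups `num_groups_y`/`num_groups_x`,
+     *          and already allocated device pointers; hierarchical kernels receive the work-group size as a separate range:
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for_work_group(::sycl::range<2>{ num_groups_y, num_groups_x },
+     *                                 ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                                 device_kernel_assembly_symm<kernel_function_type::linear>{ alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode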
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_i[FEATURE_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_j[FEATURE_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further + if (group[1] >= group[0]) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * 
THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // implicit group barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + + // implicit group barrier + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + const auto device_global_j = j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // implicit group barrier + + // calculate C += alpha * temp * B for the UPPER triangular matrix + { + // allocate shared memory + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_y = idx.get_local_id(1); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j(idx) + static_cast(internal); + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + } + }); + + // implicit group barrier + } + } + + // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // implicit group barrier + + // calculate C += alpha * temp 
* B for the LOWER triangular matrix + { + // allocate shared memory + auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal); + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ 
C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + + // implicit group barrier + } + } + } + } + + private: + /// @cond Doxygen_suppress + const real_type alpha_; + const real_type *q_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type QA_cost_; + const real_type cost_; + const real_type *B_; + real_type *C_; + const std::size_t num_classes_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp new file mode 100644 index 000000000..4391f2f19 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp @@ -0,0 +1,343 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and AdaptiveCpp's scoped parallelism. + */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function + */ +template +class device_kernel_assembly_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
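+     * @details A minimal launch sketch, assuming a `::sycl::queue` named `queue`, the number of work-groups `num_groups_y`/`num_groups_x`,
+     *          and already allocated device pointers; AdaptiveCpp's scoped parallelism extension provides a `handler::parallel()`
+     *          entry point (the exact spelling may differ between AdaptiveCpp versions):
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel<class scoped_assembly_symm>(::sycl::range<2>{ num_groups_y, num_groups_x },
+     *                                              ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                                              device_kernel_assembly_symm<kernel_function_type::linear>{ alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode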
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // exploit symmetry + if (group[1] >= group[0]) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + 
data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + const auto device_global_j = j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // calculate C += alpha * temp * B for the UPPER triangular matrix + { + // rename cached arrays + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + 
B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + } + }); + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + } + } + }); + } + + // add intermediate cached results to C + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_y = idx.get_local_id(group, 1); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j(idx) + static_cast(internal); + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + } + }); + } + } + + // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // calculate C += alpha * temp * B for the LOWER triangular matrix + { + // allocate shared memory + auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += 
static_cast(FEATURE_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to C + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal); + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + + // implicit group barrier + } + } + } + }); + } + + private: + /// @cond Doxygen_suppress + 
    const real_type alpha_;
+    const real_type *q_;
+    const real_type *data_d_;
+    const std::size_t num_rows_;
+    const std::size_t device_num_rows_;
+    const std::size_t row_offset_;
+    const std::size_t num_features_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const real_type *B_;
+    real_type *C_;
+    const std::size_t num_classes_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::tuple<Args...> kernel_function_parameter_;
+    /// @endcond
+};
+
+}  // namespace plssvm::sycl::detail::scoped
+
+#endif  // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp
similarity index 97%
rename from include/plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
rename to include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp
index c1a337107..34b55fff4 100644
--- a/include/plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp
@@ -6,11 +6,11 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  * See the LICENSE.md file in the project root for full license information.
  *
- * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend.
+ * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and the work-group data parallel kernels.
  */
 
-#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
-#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
 #pragma once
 
 #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op
@@ -18,15 +18,16 @@
 #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 
-#include "sycl/sycl.hpp" // sycl::nd_item
+#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor
 
 #include <cstddef> // std::size_t
 #include <tuple>   // std::tuple, std::make_tuple
 
-namespace plssvm::sycl::detail {
+namespace plssvm::sycl::detail::work_group {
 
 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
+ * @details Uses SYCL's work-group data parallel kernels.
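+ * A minimal launch sketch, assuming a `::sycl::queue` named `queue`, a precomputed `::sycl::nd_range<2>` `execution_range` whose
+ * local size is `THREAD_BLOCK_SIZE x THREAD_BLOCK_SIZE`, and already allocated device pointers; judging from the included headers,
+ * the work-group kernels are assumed to take the `sycl::handler` on construction to create their `local_accessor` caches:
+ * @code
+ * queue.submit([&](::sycl::handler &cgh) {
+ *     // the handler is forwarded so the kernel can allocate its local memory caches (assumption, see above)
+ *     cgh.parallel_for(execution_range,
+ *                      device_kernel_assembly_symm<kernel_function_type::linear>{ cgh, alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+ * });
+ * @endcode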
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
@@ -275,6 +276,6 @@ class device_kernel_assembly_symm {
     /// @endcond
 };
 
-} // namespace plssvm::sycl::detail
+} // namespace plssvm::sycl::detail::work_group
 
-#endif // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#endif // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp
new file mode 100644
index 000000000..c16965cb1
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp
@@ -0,0 +1,310 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ * See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and the basic data parallel kernels.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_
+#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp" // sycl::item
+
+#include <cstddef> // std::size_t
+#include <tuple>   // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::basic {
+
+/**
+ * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function.
+ * @details Uses SYCL's basic data parallel kernels.
+ */
+class device_kernel_w_linear {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+ * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + } + } + } + + // update global array with local one + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *w_d_; + const real_type *alpha_d_; + const real_type *sv_d_; + const std::size_t 
num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_predict_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
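+     * @details Each work-item accumulates an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of the result in private memory; in contrast to the work-group variant, no local memory and no explicit synchronization are used.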
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the dot product calculation + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + } + } + } + + // update global array with local one + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *w_d_; + const real_type *rho_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. + * @details Uses SYCL's basic data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_predict { + public: + /** + * @brief Initialize the SYCL kernel function object. 
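+     * @note The trailing @p kernel_function_parameter pack is stored in a `std::tuple` and later unpacked by `detail::apply_kernel_function`, e.g., the parameters of the polynomial or rbf kernel function.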
+ * @param[in] prediction_d the predicted values + * @param[in] alpha_d the previously learned weights + * @param[in] rho_d the previously learned biases + * @param[in] sv_d the support vectors + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_d_{ prediction_d }, + alpha_d_{ alpha_d }, + rho_d_{ rho_d }, + sv_d_{ sv_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], + predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + } + } + } + + // update temp using the respective kernel function + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], 
kernel_function_parameter_);
+            }
+        }
+
+        // iterate over all classes and accumulate the predictions
+        for (std::size_t dim = 0; dim < num_classes_; ++dim) {
+            if (sv_idx == 0) {
+                for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                    const auto global_pp_idx = pp_idx + static_cast<std::size_t>(internal_pd);
+                    detail::atomic_op<real_type>{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim];
+                }
+            }
+
+            // atomically add the intermediate results to the global prediction
+            for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                    const auto global_pp_idx = pp_idx + static_cast<std::size_t>(internal_pd);
+                    const auto global_sv_idx = sv_idx + static_cast<std::size_t>(internal_sv);
+
+                    detail::atomic_op<real_type>{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx];
+                }
+            }
+        }
+    }
+
+  private:
+    /// @cond Doxygen_suppress
+    real_type *prediction_d_;
+    const real_type *alpha_d_;
+    const real_type *rho_d_;
+    const real_type *sv_d_;
+    const real_type *predict_points_d_;
+    const std::size_t num_classes_;
+    const std::size_t num_sv_;
+    const std::size_t num_predict_points_;
+    const std::size_t num_features_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::tuple<Args...> kernel_function_parameter_;
+    /// @endcond
+};
+
+}  // namespace plssvm::sycl::detail::basic
+
+#endif  // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp
new file mode 100644
index 000000000..4098c4914
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp
@@ -0,0 +1,547 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and the hierarchical data parallel kernels.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_
+#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp"  // sycl::group, sycl::private_memory, sycl::h_item
+
+#include <cstddef>  // std::size_t
+#include <tuple>    // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::hierarchical {
+
+/**
+ * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function.
+ * @details Uses SYCL's hierarchical data parallel kernels.
+ */
+class device_kernel_w_linear {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
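+     * @note Launch sketch (illustrative only; `q` and the group counts `num_groups_y`/`num_groups_x` are assumptions, the actual submission lives in the SYCL backend). Hierarchical kernels are submitted via `parallel_for_work_group`:
+     * @code{.cpp}
+     * q.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for_work_group(::sycl::range<2>{ num_groups_y, num_groups_x },
+     *                                 ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                                 device_kernel_w_linear{ w_d, alpha_d, sv_d, num_classes, num_sv, device_specific_num_sv, sv_offset, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode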
+ * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory feature_idx{ group }; + ::sycl::private_memory feature_idx_linear{ group }; + ::sycl::private_memory class_idx{ group }; + ::sycl::private_memory class_idx_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // iterate 
over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA + data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + } + }); + + // implicit group barrier + + // perform the dot product calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } + } + }); + + // implicit group barrier + } + + // update global array with local one + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *w_d_; + const real_type *alpha_d_; + const real_type *sv_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_predict_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory pp_idx{ group }; + ::sycl::private_memory pp_idx_linear{ group }; + ::sycl::private_memory class_idx{ group }; + ::sycl::private_memory class_idx_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // iterate over all support vectors using blocking to be able to 
cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + } + }); + + // implicit group barrier + + // perform the dot product calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + } + } + } + }); + + // implicit group barrier + } + + // update global array with local one + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *w_d_; + const real_type *rho_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. 
+ * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_predict { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] prediction_d the predicted values + * @param[in] alpha_d the previously learned weights + * @param[in] rho_d the previously learned biases + * @param[in] sv_d the support vectors + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_d_{ prediction_d }, + alpha_d_{ alpha_d }, + rho_d_{ rho_d }, + sv_d_{ sv_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
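+     * @details Each `parallel_for_work_item` scope ends with an implicit group barrier; `sycl::private_memory` keeps the per-work-item indices and the `temp` tile alive across these scopes.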
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory pp_idx{ group }; + ::sycl::private_memory pp_idx_linear{ group }; + ::sycl::private_memory sv_idx_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + 
THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } + }); + + // implicit group barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + } + } + } + }); + + // implicit group barrier + } + } + + // update temp using the respective kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + } + } + }); + + // implicit group barrier + + { + // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception + // auto &alpha_cache = data_cache_pp; + // auto &out_cache = data_cache_sv; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_x == std::size_t{ 0 }) { + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + } else { + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + 
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to prediction_d + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + + // implicit group barrier + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *alpha_d_; + const real_type *rho_d_; + const real_type *sv_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp new file mode 100644 index 000000000..1a42161f5 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -0,0 +1,498 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and AdaptiveCpp's scoped parallelism. 
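+ * @note These kernels use AdaptiveCpp's scoped parallelism extension. A rough launch sketch; the `cgh.parallel(...)` spelling follows the AdaptiveCpp documentation, and `q`, the group counts, and `scoped_kernel_functor` are placeholder assumptions:
+ * @code{.cpp}
+ * q.submit([&](::sycl::handler &cgh) {
+ *     cgh.parallel(::sycl::range<2>{ num_groups_y, num_groups_x },
+ *                  ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+ *                  scoped_kernel_functor);
+ * });
+ * @endcode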
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ +#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_w_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
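+     * @details `sycl::memory_environment` allocates the requested local and private memory up front and passes it to the lambda; each `sycl::distribute_items_and_wait` call distributes the work across the work-items of the group and synchronizes afterwards, mirroring the implicit barriers of the hierarchical variant.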
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA + data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + } + }); + + // perform the dot product calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; 
internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } + } + }); + } + + // update global array with local one + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + real_type *w_d_; + const real_type *alpha_d_; + const real_type *sv_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_predict_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + 
THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + } + }); + + // perform the dot product calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + } + } + } + }); + } + + // update global array with local one + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *w_d_; + const real_type *rho_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. + * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_predict { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] prediction_d the predicted values + * @param[in] alpha_d the previously learned weights + * @param[in] rho_d the previously learned biases + * @param[in] sv_d the support vectors + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + prediction_d_{ prediction_d }, + alpha_d_{ alpha_d }, + rho_d_{ rho_d }, + sv_d_{ sv_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + 
global_pp_idx]; + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + } + } + } + }); + } + + // update temp using the respective kernel function + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + } + } + }); + + { + // rename cached arrays + auto &alpha_cache = data_cache_pp; + auto &out_cache = data_cache_sv; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_x == std::size_t{ 0 }) { + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + } else { + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + } + }); + + // calculate intermediate results and 
store them in local memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + } + } + }); + } + + // add intermediate cached results to prediction_d + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + } + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *alpha_d_; + const real_type *rho_d_; + const real_type *sv_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::scoped + +#endif // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp similarity index 97% rename from include/plssvm/backends/SYCL/kernel/predict_kernel.hpp rename to include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index aa12069d0..d451ac7d5 100644 --- a/include/plssvm/backends/SYCL/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -6,11 +6,11 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend. + * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and the work-group data parallel kernels. 
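The scoped kernels above all follow the same AdaptiveCpp scoped-parallelism pattern: a functor whose `operator()` receives an implementation-defined group, requests local and per-work-item private storage through `::sycl::memory_environment`, and runs per-item code via `::sycl::distribute_items_and_wait`. The following minimal sketch condenses that pattern into a hypothetical `scoped_axpy` kernel (illustrative only, not part of this patch); it assumes an AdaptiveCpp toolchain with the scoped-parallelism extension and uses only the constructs already appearing in the kernels above.

```cpp
#include <cstddef>        // std::size_t

#include "sycl/sycl.hpp"  // AdaptiveCpp SYCL header providing the scoped-parallelism extension

// Hypothetical kernel computing y[i] += a * x[i] in the same scoped-parallelism style as the
// PLSSVM kernels above: private memory is requested once per work-item and then accessed via
// the s_item inside distribute_items_and_wait regions.
class scoped_axpy {
  public:
    scoped_axpy(float *y, const float *x, const float a, const std::size_t n) :
        y_{ y }, x_{ x }, a_{ a }, n_{ n } { }

    template <typename T>
    void operator()(T group) const {
        ::sycl::memory_environment(group,
                                   ::sycl::require_private_mem<std::size_t>(),
                                   [&](auto &global_idx) {
                                       // initialize the per-work-item private variable
                                       ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                                           global_idx(idx) = group[0] * group.get_logical_local_range(0) + idx.get_local_id(group, 0);
                                       });
                                       // perform the actual update
                                       ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                                           if (global_idx(idx) < n_) {
                                               y_[global_idx(idx)] += a_ * x_[global_idx(idx)];
                                           }
                                       });
                                   });
    }

  private:
    float *y_;
    const float *x_;
    const float a_;
    const std::size_t n_;
};
```

A queue submission for such a functor would then mirror the `scoped` branches in `csvm.cpp` below, i.e. `cgh.parallel(grid_range, block_range, scoped_axpy{ y, x, a, n })` inside a `queue.submit` call.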
*/ -#ifndef PLSSVM_BACKENDS_SYCL_PREDICT_KERNEL_HPP_ -#define PLSSVM_BACKENDS_SYCL_PREDICT_KERNEL_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ +#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ #pragma once #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op @@ -18,15 +18,16 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "sycl/sycl.hpp" // sycl::item +#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor #include // std::size_t #include // std::tuple, std::make_tuple -namespace plssvm::sycl::detail { +namespace plssvm::sycl::detail::work_group { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_w_linear { public: @@ -140,6 +141,7 @@ class device_kernel_w_linear { /** * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_predict_linear { public: @@ -257,6 +259,7 @@ class device_kernel_predict_linear { /** * @brief Predict the @p predict_points_d using the @p kernel_function. + * @details Uses SYCL's work-group data parallel kernels. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ @@ -430,6 +433,6 @@ class device_kernel_predict { /// @endcond }; -} // namespace plssvm::sycl::detail +} // namespace plssvm::sycl::detail::work_group -#endif // PLSSVM_BACKENDS_SYCL_PREDICT_KERNEL_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp b/include/plssvm/backends/SYCL/kernel_invocation_types.hpp index ec10b05e8..d7cec1924 100644 --- a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp +++ b/include/plssvm/backends/SYCL/kernel_invocation_types.hpp @@ -17,6 +17,7 @@ #include "fmt/ostream.h" // fmt::ostream_formatter #include // forward declare std::ostream and std::istream +#include // std::vector namespace plssvm::sycl { @@ -26,10 +27,23 @@ namespace plssvm::sycl { enum class kernel_invocation_type { /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */ automatic, - /** Use the [`nd_range` invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke). */ - nd_range + /** Use the [`basic` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_basic_data_parallel_kernels). */ + basic, + /** Use the [`work-group` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_work_group_data_parallel_kernels). */ + work_group, + /** Use the [`hierarchical` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_hierarchical_data_parallel_kernels). **Note:** deprecated in newer SYCL version, will be replaced with a "better" version in future SYCL specifications. 
*/ + hierarchical, + /** Use the AdaptiveCpp specific [`scoped` parallelism](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/scoped-parallelism.md). */ + scoped }; +/** + * @brief Return a list of all currently available SYCL kernel invocation types. + * @details SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation type can be disabled during the CMake configuration. + * @return the available SYCL kernel invocation types (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector list_available_sycl_kernel_invocation_types(); + /** * @brief Output the @p invocation type to the given output-stream @p out. * @param[in,out] out the output-stream to write the backend type to diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp index 5d930aa19..073e92f6c 100644 --- a/include/plssvm/detail/cmd/parser_predict.hpp +++ b/include/plssvm/detail/cmd/parser_predict.hpp @@ -13,11 +13,12 @@ #define PLSSVM_DETAIL_CMD_PARSER_PREDICT_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // fmt::ostream_formatter @@ -46,6 +47,8 @@ struct parser_predict { /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel. target_platform target{ target_platform::automatic }; + /// The kernel invocation type when using SYCL as backend. + sycl::kernel_invocation_type sycl_kernel_invocation_type{ sycl::kernel_invocation_type::automatic }; /// The SYCL implementation to use with `--backend sycl`. 
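`list_available_sycl_kernel_invocation_types()` is only declared in this header; its definition is not part of this hunk. A plausible sketch, assuming it is guarded by the same `PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED` macro used in `csvm.cpp` below (the actual implementation may differ), is:

```cpp
#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type

#include <vector>  // std::vector

namespace plssvm::sycl {

std::vector<kernel_invocation_type> list_available_sycl_kernel_invocation_types() {
    // automatic, basic, and work_group are always available
    std::vector<kernel_invocation_type> available_types{ kernel_invocation_type::automatic,
                                                         kernel_invocation_type::basic,
                                                         kernel_invocation_type::work_group };
#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED)
    // hierarchical and scoped support can be disabled via the
    // PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS CMake option
    available_types.push_back(kernel_invocation_type::hierarchical);
    available_types.push_back(kernel_invocation_type::scoped);
#endif
    return available_types;
}

}  // namespace plssvm::sycl
```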
sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; diff --git a/src/main_predict.cpp b/src/main_predict.cpp index f27ad2d2f..ee16daba2 100644 --- a/src/main_predict.cpp +++ b/src/main_predict.cpp @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) { // create default csvm const std::unique_ptr svm = [&]() { if (use_sycl_as_backend) { - return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type); + return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); } else if (use_kokkos_as_backend) { return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); } else { diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt b/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt index 7dc8bb824..07b18e4f4 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt @@ -18,6 +18,13 @@ option(PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP "Enables the generic SSC if (PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP) message(STATUS "Using the new AdaptiveCpp SSCP compilation flow.") set(ACPP_TARGETS "generic" CACHE STRING "" FORCE) + if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS) + message( + WARNING "Enabled SYCL's hierarchical and AdaptiveCpp's scoped kernels in AdaptiveCpp while using its SSCP compilation flow. " + "SSCP, however, does not currently implement these kernel invocation types, resulting in a runtime exception. " + "If you wish to use them, set \"PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP\" to \"OFF\" and use one of the legacy compilation flows. 
" + ) + endif () else () message(STATUS "Using the old AdaptiveCpp compilation flow.") # reformat PLSSVM_TARGET_PLATFORMS to be usable with ACPP_TARGETS diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp index eaf394dd2..cf37bd48a 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp @@ -8,38 +8,50 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} -#include "plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp" // plssvm::adaptivecpp::detail::::device_ptr -#include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) -#include "plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp" // plssvm::adaptivecpp::detail::{get_device_list, device_synchronize, get_adaptivecpp_version_short, get_adaptivecpp_version} -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp" // plssvm::sycl::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} -#include "plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::device_kernel_assembly -#include "plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::device_kernel_assembly_symm -#include "plssvm/backends/SYCL/kernel/predict_kernel.hpp" // plssvm::sycl::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} -#include "plssvm/detail/logging/log.hpp" // plssvm::detail::log -#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size -#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry -#include "plssvm/detail/utility.hpp" // plssvm::detail::get_system_memory -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma_type -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information -#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::parameter -#include "plssvm/shape.hpp" // plssvm::shape -#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level - -#include "sycl/sycl.hpp" // ::sycl::range, ::sycl::nd_range, ::sycl::handler, ::sycl::info::device +#include "plssvm/backend_types.hpp" // plssvm::backend_type 
+#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} +#include "plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp" // plssvm::adaptivecpp::detail::::device_ptr +#include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) +#include "plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp" // plssvm::adaptivecpp::detail::{get_device_list, device_synchronize, get_adaptivecpp_version_short, get_adaptivecpp_version} +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp" // plssvm::sycl::detail::basic::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp" // plssvm::sycl::detail::scoped::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::scoped::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp" // plssvm::sycl::detail::work_group::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::scoped::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp" // plssvm::sycl::detail::basic::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp" // plssvm::sycl::detail::scoped::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp" // plssvm::sycl::detail::work_group::{device_kernel_w_linear, device_kernel_predict_linear, 
device_kernel_predict} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} +#include "plssvm/detail/logging/log.hpp" // plssvm::detail::log +#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size +#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry +#include "plssvm/detail/utility.hpp" // plssvm::detail::get_system_memory +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma_type +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::parameter +#include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range, sycl::handler, sycl::info::device #include "fmt/color.h" // fmt::fg, fmt::color::orange #include "fmt/format.h" // fmt::format @@ -96,12 +108,12 @@ void csvm::init(const target_platform target) { // set correct kernel invocation type if "automatic" has been provided if (invocation_type_ == sycl::kernel_invocation_type::automatic) { - // always use nd_range for AdaptiveCpp - invocation_type_ = sycl::kernel_invocation_type::nd_range; - if (target_ == target_platform::cpu) { + // always use work_group for AdaptiveCpp + invocation_type_ = sycl::kernel_invocation_type::work_group; + if (target_ == target_platform::cpu) { // TODO: set to hierarchical or scoped?! 
#if !defined(__ACPP_USE_ACCELERATED_CPU__) && defined(__ACPP_ENABLE_OMPHOST_TARGET__) plssvm::detail::log_untracked(verbosity_level::full | verbosity_level::warning, - "WARNING: the AdaptiveCpp automatic target for the CPU is set to nd_range, but AdaptiveCpp hasn't been build with the \"omp.accelerated\" compilation flow resulting in major performance losses!\n"); + "WARNING: the AdaptiveCpp automatic target for the CPU is set to work_group, but AdaptiveCpp hasn't been build with the \"omp.accelerated\" compilation flow resulting in major performance losses!\n"); #endif } } @@ -120,7 +132,7 @@ void csvm::init(const target_platform target) { // use more detailed single rank command line output plssvm::detail::log_untracked(verbosity_level::full, comm_, - "\nUsing AdaptiveCpp ({}; {}) as SYCL backend with the kernel invocation type \"{}\" for the svm_kernel.\n", + "\nUsing AdaptiveCpp ({}; {}) as SYCL backend with the kernel invocation type \"{}\".\n", detail::get_adaptivecpp_version_short(), PLSSVM_ACPP_TARGETS, invocation_type_); @@ -242,50 +254,273 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
}; +#endif + break; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
}; +#endif + break; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; } } @@ -304,35 +539,85 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const std::size_t row_offset = data_distribution_->place_row_offset(device_id); - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_mirror_block = detail::dim_type_to_native<2>(mirror_exec.block); - for (const auto &[partial_grid, offsets] : mirror_exec.grids) { const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > 0) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_mirror_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_mirror_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::basic::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::work_group::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, mirror_exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, mirror_exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } } detail::device_synchronize(device); @@ -342,16 +627,43 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } detail::device_synchronize(device); } @@ -360,16 +672,43 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } detail::device_synchronize(device); } @@ -387,50 +726,273 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
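// The range conversion that was previously inlined here (dim_type_to_native<2> plus an nd_range
// construction) has been moved into detail::get_execution_range. A minimal sketch of such a helper,
// assuming it simply reproduces the removed code; the actual helper in the SYCL backend's detail
// utilities may differ in signature and return type:
[[nodiscard]] inline ::sycl::nd_range<2> get_execution_range(const ::plssvm::detail::dim_type &partial_grid, const ::plssvm::detail::dim_type &block) {
    // convert the execution range block to SYCL's native range<2>
    const ::sycl::range<2> native_block = detail::dim_type_to_native<2>(block);
    // scale the partial grid by the block size to obtain the global range
    const ::sycl::range<2> native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block;
    return ::sycl::nd_range<2>{ native_partial_grid, native_block };
}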
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
}; +#endif + break; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
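// A constructor difference that holds throughout this file: only the work_group functors are handed
// the ::sycl::handler, while the basic, hierarchical, and scoped variants are not. A plausible
// reason, not spelled out in this patch, is that the nd_range kernels allocate explicit work-group
// local memory through local accessors, which can only be created from the handler. Illustrative
// sketch (tile size, member names, and the kernel body are assumptions, not PLSSVM code):
#include <cstddef>

#include <sycl/sycl.hpp>

class tile_copy_kernel {
  public:
    tile_copy_kernel(::sycl::handler &cgh, const float *in, float *out) :
        tile_{ ::sycl::range<2>{ 16, 16 }, cgh },  // explicit work-group local memory
        in_{ in },
        out_{ out } { }

    void operator()(::sycl::nd_item<2> item) const {
        const std::size_t i = item.get_local_id(0);
        const std::size_t j = item.get_local_id(1);
        // cooperatively stage a tile in local memory ...
        tile_[i][j] = in_[item.get_global_linear_id()];
        ::sycl::group_barrier(item.get_group());
        // ... and consume it after the barrier
        out_[item.get_global_linear_id()] = tile_[i][j];
    }

  private:
    ::sycl::local_accessor<float, 2> tile_;
    const float *in_;
    float *out_;
};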
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw 
backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, 
std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), 
num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; } } @@ -453,18 +1015,42 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
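// For readers unfamiliar with the two invocation types guarded by
// PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED: SYCL's hierarchical parallelism invokes the
// outer kernel once per work-group and distributes work-items via parallel_for_work_item, while
// AdaptiveCpp's scoped parallelism uses the handler::parallel extension seen above and is therefore
// only offered when AdaptiveCpp is the SYCL implementation. A minimal, self-contained hierarchical
// example; the buffer contents and kernel body are arbitrary and unrelated to PLSSVM's kernels:
#include <cstddef>
#include <vector>

#include <sycl/sycl.hpp>

int main() {
    constexpr std::size_t num_groups = 8;
    constexpr std::size_t group_size = 64;
    std::vector<float> data(num_groups * group_size, 1.0f);

    ::sycl::queue q{};
    {
        ::sycl::buffer<float, 1> buf{ data.data(), ::sycl::range<1>{ data.size() } };
        q.submit([&](::sycl::handler &cgh) {
            ::sycl::accessor acc{ buf, cgh, ::sycl::read_write };
            // outer lambda: executed once per work-group
            cgh.parallel_for_work_group(::sycl::range<1>{ num_groups }, ::sycl::range<1>{ group_size }, [=](::sycl::group<1> grp) {
                // inner lambda: executed once per work-item of the current work-group
                grp.parallel_for_work_item([&](::sycl::h_item<1> item) {
                    acc[item.get_global_id()] *= 2.0f;
                });
            });
        });
    }  // buffer destruction waits for the kernel and copies the results back into data
    return 0;
}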
}; +#endif + } } detail::device_synchronize(device); @@ -480,50 +1066,273 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; } } diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index b7a0fb119..d0de8c7f2 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -20,6 +20,12 @@ set(PLSSVM_SYCL_BACKEND_FOUND_IMPLEMENTATIONS "") # check if SYCL can be enabled # ######################################################################################################################## +# enable kernel invocation types +option(PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS "Enables SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation types." ON) +if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS) + message(STATUS "Enable SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation types.") +endif () + # add AdaptiveCpp set(PLSSVM_ENABLE_SYCL_ADAPTIVECPP_BACKEND ${PLSSVM_ENABLE_SYCL_BACKEND} CACHE STRING "Enable AdaptiveCpp as SYCL Backend") set_property(CACHE PLSSVM_ENABLE_SYCL_ADAPTIVECPP_BACKEND PROPERTY STRINGS AUTO ON OFF) @@ -108,6 +114,24 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME}) ) endif () +# add kernel invocation type compile definitions +if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS) + target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} INTERFACE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) + target_compile_definitions(${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + endif () + if (TARGET ${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME}) + target_compile_definitions(${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + endif () + + # create name list + set(PLSSVM_SYCL_KERNEL_HIERARCHICAL_AND_SCOPED_NAME_LIST ";hierarchical") + if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) + list(APPEND PLSSVM_SYCL_KERNEL_HIERARCHICAL_AND_SCOPED_NAME_LIST "scoped") + endif () +endif () + # link against interface library target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) @@ -117,8 +141,14 @@ target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_HAS_SYCL_B # mark backend library as install target append_local_and_parent(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) -# set manpage string +# set manpage strings set_local_and_parent(PLSSVM_SYCL_BACKEND_NAME_LIST "automatic;${PLSSVM_SYCL_BACKEND_FOUND_IMPLEMENTATIONS}") +set_local_and_parent(PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST 
"automatic;basic;work_group${PLSSVM_SYCL_KERNEL_HIERARCHICAL_AND_SCOPED_NAME_LIST}") + +# populate transformed ACPP_TARGETS for tests +if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) + set_local_and_parent(ACPP_TARGETS "${ACPP_TARGETS}") +endif () # populate transformed ACPP_TARGETS for tests if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) diff --git a/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt b/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt index d3e53ba83..b52eb5d25 100644 --- a/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt @@ -25,6 +25,14 @@ if (PLSSVM_SYCL_BACKEND_CHECK_FOR_DPCPP_COMPILER) message(CHECK_PASS "found") append_local_and_parent(PLSSVM_SYCL_BACKEND_FOUND_IMPLEMENTATIONS "dpcpp") + # check if the hierarchical kernel is enabled while the build type is Debug -> throw a warning + if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS AND uppercase_CMAKE_BUILD_TYPE MATCHES DEBUG) + message(WARNING "Enabled SYCL's hierarchical kernels in DPC++ while using Debug as build type. " + "This may result in compilation errors during PTX code generation. " + "If this is the case, either set \"PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS\" to \"OFF\" or use another build type." + ) + endif () + # set DPC++ specific targets set(PLSSVM_SYCL_DPCPP_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 2d3c85ec8..34dbf083f 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -8,36 +8,44 @@ #include "plssvm/backends/SYCL/DPCPP/csvm.hpp" -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} -#include "plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp" // plssvm::dpcpp::detail::::device_ptr -#include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) -#include "plssvm/backends/SYCL/DPCPP/detail/utility.hpp" // plssvm::dpcpp::detail::{get_device_list, device_synchronize, get_dpcpp_version} -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp" // plssvm::sycl::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} -#include "plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::device_kernel_assembly -#include "plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::device_kernel_assembly_symm -#include "plssvm/backends/SYCL/kernel/predict_kernel.hpp" // plssvm::sycl::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} 
-#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size -#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma_type -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/shape.hpp" // plssvm::shape -#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level - -#include "sycl/sycl.hpp" // sycl::queue, sycl::range, sycl::nd_range, sycl::handler, sycl::info::device +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} +#include "plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp" // plssvm::dpcpp::detail::::device_ptr +#include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) +#include "plssvm/backends/SYCL/DPCPP/detail/utility.hpp" // plssvm::dpcpp::detail::{get_device_list, device_synchronize, get_dpcpp_version} +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp" // plssvm::sycl::detail::basic::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp" // plssvm::sycl::detail::work_group::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp" // plssvm::sycl::detail::basic::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include 
"plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp" // plssvm::sycl::detail::work_group::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} +#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size +#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma_type +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range, sycl::handler, sycl::info::device #include "fmt/color.h" // fmt::fg, fmt::color::orange #include "fmt/format.h" // fmt::format @@ -94,8 +102,8 @@ void csvm::init(const target_platform target) { // set correct kernel invocation type if "automatic" has been provided if (invocation_type_ == sycl::kernel_invocation_type::automatic) { - // always use nd_range for DPC++ - invocation_type_ = sycl::kernel_invocation_type::nd_range; + // always use work_group for DPC++ + invocation_type_ = sycl::kernel_invocation_type::work_group; } std::vector device_names{}; @@ -112,7 +120,7 @@ void csvm::init(const target_platform target) { // use more detailed single rank command line output plssvm::detail::log_untracked(verbosity_level::full, comm_, - "\nUsing DPC++ ({}; {}) as SYCL backend with the kernel invocation type \"{}\" for the svm_kernel.\n", + "\nUsing DPC++ ({}; {}) as SYCL backend with the kernel invocation type \"{}\".\n", detail::get_dpcpp_version(), detail::get_dpcpp_timestamp_version(), invocation_type_); @@ -159,7 +167,7 @@ csvm::~csvm() { for (const queue_type &q : devices_) { detail::device_synchronize(q); } - } catch (const plssvm::exception &e) { + } catch (const std::exception &e) { std::cout << e.what() << std::endl; std::terminate(); } @@ -229,50 +237,220 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution 
@@ -229,50 +237,220 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons
     device_ptr_type kernel_matrix_d{ num_entries_padded, device };  // only explicitly store the upper triangular matrix
     const real_type cost_factor = real_type{ 1.0 } / params.cost;
 
-    // convert execution range block to SYCL's native range<2>
-    const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block);
-
     for (const auto &[partial_grid, offsets] : exec.grids) {
-        // convert execution range partial_grid to SYCL's native range<2>
-        const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block;
-
-        const ::sycl::nd_range native_exec{ native_partial_grid, native_block };
-
         switch (params.kernel_type) {
+            //***************************************************//
+            //                linear kernel function             //
+            //***************************************************//
             case kernel_function_type::linear:
-                device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) {
-                    cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
-                });
+                switch (invocation_type_) {
+                    case sycl::kernel_invocation_type::automatic:
+                        throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" };
+                    case sycl::kernel_invocation_type::basic:
+                        device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) {
+                            cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block),
+                                             sycl::detail::basic::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
+                        });
+                        break;
+                    case sycl::kernel_invocation_type::work_group:
+                        device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) {
+                            cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block),
+                                             sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
+                        });
+                        break;
+                    case sycl::kernel_invocation_type::hierarchical:
+#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED)
+                        device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) {
+                            const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block);
+                            cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
+                        });
+#else
+                        throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" };
+#endif
+                        break;
+                    case sycl::kernel_invocation_type::scoped:
+                        throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!"
}; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; } } @@ -291,35 +469,69 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const std::size_t row_offset = data_distribution_->place_row_offset(device_id); - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_mirror_block = detail::dim_type_to_native<2>(mirror_exec.block); - for (const auto &[partial_grid, offsets] : mirror_exec.grids) { const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > 0) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_mirror_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_mirror_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::basic::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::work_group::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, mirror_exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } } detail::device_synchronize(device); @@ -329,16 +541,35 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } detail::device_synchronize(device); } @@ -347,16 +578,35 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } detail::device_synchronize(device); } @@ -374,50 +624,220 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; } } @@ -440,18 +860,35 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } detail::device_synchronize(device); @@ -467,50 +904,220 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; } } diff --git a/src/plssvm/backends/SYCL/kernel_invocation_types.cpp b/src/plssvm/backends/SYCL/kernel_invocation_types.cpp index 58a73ca26..87ee18f26 100644 --- a/src/plssvm/backends/SYCL/kernel_invocation_types.cpp +++ b/src/plssvm/backends/SYCL/kernel_invocation_types.cpp @@ -14,15 +14,37 @@ #include // std::istream #include // std::ostream #include // std::string +#include // std::vector namespace plssvm::sycl { +std::vector list_available_sycl_kernel_invocation_types() { + std::vector available_sycl_kernel_invocation_types = { + kernel_invocation_type::automatic, + kernel_invocation_type::basic, + kernel_invocation_type::work_group + }; +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + available_sycl_kernel_invocation_types.push_back(kernel_invocation_type::hierarchical); + #if defined(PLSSVM_SYCL_BACKEND_HAS_ADAPTIVECPP) + available_sycl_kernel_invocation_types.push_back(kernel_invocation_type::scoped); + #endif +#endif + return available_sycl_kernel_invocation_types; +} + std::ostream &operator<<(std::ostream &out, const kernel_invocation_type invocation) { switch (invocation) { case kernel_invocation_type::automatic: return out << "automatic"; - case kernel_invocation_type::nd_range: - return out << "nd_range"; + case kernel_invocation_type::basic: + return out << "basic"; + case kernel_invocation_type::work_group: + return out << "work_group"; + case kernel_invocation_type::hierarchical: + return out << "hierarchical"; + case kernel_invocation_type::scoped: + return out << "scoped"; } return out << "unknown"; } @@ -34,8 +56,14 @@ std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation) { if (str == "automatic" || str == "auto") { invocation = kernel_invocation_type::automatic; - } else if (str == "nd_range") { - invocation = kernel_invocation_type::nd_range; + } else if (str == "basic") { + invocation = kernel_invocation_type::basic; + } else if (str == "work_group" || str == "work-group" || str == "nd_range" || str == "nd-range") { + invocation = kernel_invocation_type::work_group; + } else if (str == "hierarchical") { + invocation = kernel_invocation_type::hierarchical; + } else if (str == "scoped") { + invocation = kernel_invocation_type::scoped; } else { in.setstate(std::ios::failbit); } diff --git a/src/plssvm/detail/cmd/parser_predict.cpp b/src/plssvm/detail/cmd/parser_predict.cpp index c90bb62d1..ed4476e96 100644 --- a/src/plssvm/detail/cmd/parser_predict.cpp +++ b/src/plssvm/detail/cmd/parser_predict.cpp @@ -8,17 +8,18 @@ #include "plssvm/detail/cmd/parser_predict.hpp" -#include "plssvm/backend_types.hpp" // plssvm::list_available_backends -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::list_available_execution_spaces -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::list_available_sycl_implementations -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/exceptions/exceptions.hpp" // plssvm::cmd_parser_exit -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/target_platforms.hpp" // plssvm::list_available_target_platforms -#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity, plssvm::verbosity_level -#include "plssvm/version/version.hpp" // plssvm::version::detail::get_version_info +#include "plssvm/backend_types.hpp" // plssvm::list_available_backends +#include 
"plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::list_available_execution_spaces +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::list_available_sycl_implementations +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::{list_available_sycl_kernel_invocation_types, kernel_invocation_type} +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/exceptions/exceptions.hpp" // plssvm::cmd_parser_exit +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::list_available_target_platforms +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity, plssvm::verbosity_level +#include "plssvm/version/version.hpp" // plssvm::version::detail::get_version_info #include "cxxopts.hpp" // cxxopts::{Options, value, ParseResult} #include "fmt/color.h" // fmt::fg, fmt::color::orange @@ -52,6 +53,7 @@ parser_predict::parser_predict(const mpi::communicator &comm, int argc, char **a ("b,backend", fmt::format("choose the backend: {}", fmt::join(list_available_backends(), "|")), cxxopts::value()->default_value(fmt::format("{}", backend))) ("p,target_platform", fmt::format("choose the target platform: {}", fmt::join(list_available_target_platforms(), "|")), cxxopts::value()->default_value(fmt::format("{}", target))) #if defined(PLSSVM_HAS_SYCL_BACKEND) + ("sycl_kernel_invocation_type", fmt::format("choose the kernel invocation type when using SYCL as backend: {}", fmt::join(sycl::list_available_sycl_kernel_invocation_types(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_implementation_type))) #endif #if defined(PLSSVM_HAS_KOKKOS_BACKEND) @@ -119,13 +121,24 @@ parser_predict::parser_predict(const mpi::communicator &comm, int argc, char **a #if defined(PLSSVM_HAS_SYCL_BACKEND) { - // parse SYCL implementation used in the SYCL backend - sycl_implementation_type = result["sycl_implementation_type"].as(); + // parse kernel invocation type when using SYCL as backend + sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as(); - // assembly warning condition + // assemble warning condition const std::vector target_platforms = { target == target_platform::automatic ? 
determine_default_target_platform() : target }; const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl); + // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend + if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + comm, + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n", + sycl_kernel_invocation_type); + } + + // parse SYCL implementation used in the SYCL backend + sycl_implementation_type = result["sycl_implementation_type"].as(); + // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) { detail::log_untracked(verbosity_level::full | verbosity_level::warning, @@ -237,7 +250,11 @@ std::ostream &operator<<(std::ostream &out, const parser_predict ¶ms) { params.target); if (params.backend == backend_type::sycl || params.backend == backend_type::automatic) { - out << fmt::format("SYCL implementation type: {}\n", params.sycl_implementation_type); + out << fmt::format( + "SYCL implementation type: {}\n" + "SYCL kernel invocation type: {}\n", + params.sycl_implementation_type, + params.sycl_kernel_invocation_type); } if (params.backend == backend_type::kokkos || params.backend == backend_type::automatic) { diff --git a/src/plssvm/detail/cmd/parser_train.cpp b/src/plssvm/detail/cmd/parser_train.cpp index fdb0070c9..b47422a0f 100644 --- a/src/plssvm/detail/cmd/parser_train.cpp +++ b/src/plssvm/detail/cmd/parser_train.cpp @@ -11,7 +11,7 @@ #include "plssvm/backend_types.hpp" // plssvm::list_available_backends, plssvm::determine_default_backend #include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{list_available_execution_spaces, execution_space} #include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{list_available_sycl_implementations, implementation_type} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::{list_available_sycl_kernel_invocation_types, kernel_invocation_type} #include "plssvm/classification_types.hpp" // plssvm::classification_type, plssvm::classification_type_to_full_string #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT @@ -80,7 +80,7 @@ parser_train::parser_train(const mpi::communicator &comm, int argc, char **argv) ("b,backend", fmt::format("choose the backend: {}", fmt::join(list_available_backends(), "|")), cxxopts::value()->default_value(fmt::format("{}", backend))) ("p,target_platform", fmt::format("choose the target platform: {}", fmt::join(list_available_target_platforms(), "|")), cxxopts::value()->default_value(fmt::format("{}", target))) #if defined(PLSSVM_HAS_SYCL_BACKEND) - ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range", cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) + ("sycl_kernel_invocation_type", fmt::format("choose the kernel invocation type when using SYCL as backend: {}", 
fmt::join(sycl::list_available_sycl_kernel_invocation_types(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_implementation_type))) #endif #if defined(PLSSVM_HAS_KOKKOS_BACKEND) diff --git a/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp b/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp index 448c9bc97..446374ef6 100644 --- a/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp +++ b/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp @@ -14,7 +14,7 @@ #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost, plssvm::sycl_kernel_invocation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "tests/backends/generic_base_csvc_tests.hpp" // generic C-SVC tests to instantiate @@ -49,7 +49,7 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, default_construct) { // default constructor must always work EXPECT_NO_THROW(csvm_type{}); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(AdaptiveCppCSVMConstructor, construct_parameter) { @@ -57,7 +57,7 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_parameter) { // the automatic target platform must always be available EXPECT_NO_THROW(csvm_type{ plssvm::parameter{} }); - EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_parameter) { @@ -69,33 +69,33 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_parameter) { // every target is allowed for SYCL #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params })); - EXPECT_NO_THROW((csvm_type{ 
plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -107,7 +107,7 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_named_args) { // every target is allowed for SYCL EXPECT_NO_THROW((csvm_type{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_named_args) { @@ -117,48 +117,48 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_named_args) { #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ 
plssvm::target_platform::cpu, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else 
EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -174,18 +174,24 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, get_kernel_invocation_type) { EXPECT_NE(svm.get_kernel_invocation_type(), plssvm::sycl::kernel_invocation_type::automatic); } -template +template struct adaptivecpp_csvm_test_type { using mock_csvm_type = mock_adaptivecpp_csvm; using csvm_type = plssvm::adaptivecpp::csvm; using csvc_type = plssvm::adaptivecpp::csvc; using csvr_type = plssvm::adaptivecpp::csvr; using device_ptr_type = typename csvm_type::device_ptr_type; - inline constexpr static auto additional_arguments = std::make_tuple(); + inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::sycl_kernel_invocation_type, invocation_type)); }; // a tuple containing the test structs -using adaptivecpp_csvm_test_tuple = std::tuple>; +using adaptivecpp_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type, +#endif + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type>; // the tests used in the instantiated GTest test suites // general test types @@ -231,7 +237,14 @@ INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppCSVM, GenericGPUCSVMKernelFunction, ad // generic GPU C-SVM DeathTests - correct grid sizes INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppCSVMDeathTest, GenericGPUCSVMDeathTest, adaptivecpp_csvm_test_type_gtest, naming::test_parameter_to_name); -using adaptivecpp_mock_csvm_test_tuple = std::tuple>; +using adaptivecpp_mock_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type, +#endif + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type>; + using adaptivecpp_mock_csvm_test_type_list = util::cartesian_type_product_t; using adaptivecpp_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t; diff --git a/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp b/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp index a3097380c..e909fa2a9 100644 --- a/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp +++ b/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp @@ -10,10 +10,11 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp" -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include "gtest/gtest.h" // TEST, EXPECT_NE, EXPECT_FALSE @@ -60,6 +61,50 @@ TEST(AdaptiveCppUtility, dim_type_to_native_3) { EXPECT_EQ(native_dim[2], dim.x); } +TEST(AdaptiveCppUtility, get_execution_range_basic) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL 
execution range + const ::sycl::range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (sycl::range<2>{ 512ull, 512ull })); +} + +TEST(AdaptiveCppUtility, get_execution_range_work_group) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 512ull, 512ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + +TEST(AdaptiveCppUtility, get_execution_range_hierarchical) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 64ull, 64ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + +TEST(AdaptiveCppUtility, get_execution_range_scoped) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 64ull, 64ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + TEST(AdaptiveCppUtility, get_device_list) { const auto &[queues, actual_target] = plssvm::adaptivecpp::detail::get_device_list(plssvm::target_platform::automatic); // at least one queue must be provided diff --git a/tests/backends/SYCL/DPCPP/detail/utility.cpp b/tests/backends/SYCL/DPCPP/detail/utility.cpp index ca6eaa713..84b2d60f9 100644 --- a/tests/backends/SYCL/DPCPP/detail/utility.cpp +++ b/tests/backends/SYCL/DPCPP/detail/utility.cpp @@ -10,10 +10,11 @@ #include "plssvm/backends/SYCL/DPCPP/detail/utility.hpp" -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include "gtest/gtest.h" // TEST, EXPECT_NE, EXPECT_FALSE @@ -59,6 +60,39 @@ TEST(DPCPPUtility, dim_type_to_native_3) { EXPECT_EQ(native_dim[2], dim.x); } +TEST(DPCPPUtility, get_execution_range_basic) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::range exec = plssvm::dpcpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (sycl::range<2>{ 512ull, 512ull })); +} + +TEST(DPCPPUtility, get_execution_range_work_group) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::dpcpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 512ull, 512ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + +TEST(DPCPPUtility, get_execution_range_hierarchical) { + // create a grid + const 
plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::dpcpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 64ull, 64ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + TEST(DPCPPUtility, get_device_list) { const auto &[queues, actual_target] = plssvm::dpcpp::detail::get_device_list(plssvm::target_platform::automatic); // at least one queue must be provided @@ -67,12 +101,12 @@ TEST(DPCPPUtility, get_device_list) { EXPECT_NE(actual_target, plssvm::target_platform::automatic); } -TEST(AdaptiveCppUtility, get_dpcpp_version) { +TEST(DPCPPUtility, get_dpcpp_version) { const std::regex reg{ "[0-9]+\\.[0-9]+\\.[0-9]+", std::regex::extended }; EXPECT_TRUE(std::regex_match(plssvm::dpcpp::detail::get_dpcpp_version(), reg)); } -TEST(AdaptiveCppUtility, get_dpcpp_timestamp_version) { +TEST(DPCPPUtility, get_dpcpp_timestamp_version) { const std::string version = plssvm::dpcpp::detail::get_dpcpp_timestamp_version(); EXPECT_FALSE(version.empty()); } diff --git a/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp b/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp index b8ca19bf8..ada2f4b56 100644 --- a/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp +++ b/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp @@ -14,7 +14,7 @@ #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost, plssvm::sycl_kernel_invocation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "tests/backends/generic_base_csvc_tests.hpp" // generic C-SVC tests to instantiate @@ -49,7 +49,7 @@ TYPED_TEST(DPCPPCSVMConstructor, default_construct) { // default constructor must always work EXPECT_NO_THROW(csvm_type{}); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(DPCPPCSVMConstructor, construct_parameter) { @@ -57,7 +57,7 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_parameter) { // the automatic target platform must always be available EXPECT_NO_THROW(csvm_type{ plssvm::parameter{} }); - EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_parameter) { @@ -69,33 +69,33 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_parameter) { // every target is allowed for SYCL #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - 
EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -107,7 +107,7 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_named_args) { // every target is allowed for SYCL EXPECT_NO_THROW((csvm_type{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = 
plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_named_args) { @@ -117,48 +117,48 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_named_args) { #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ 
plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -174,18 +174,23 @@ TYPED_TEST(DPCPPCSVMConstructor, get_kernel_invocation_type) { EXPECT_NE(svm.get_kernel_invocation_type(), plssvm::sycl::kernel_invocation_type::automatic); } -template +template struct dpcpp_csvm_test_type { using mock_csvm_type = mock_dpcpp_csvm; using csvm_type = plssvm::dpcpp::csvm; using csvc_type = plssvm::dpcpp::csvc; using csvr_type = plssvm::dpcpp::csvr; using device_ptr_type = typename csvm_type::device_ptr_type; - inline static auto additional_arguments = std::make_tuple(); + inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::sycl_kernel_invocation_type, invocation_type)); }; // a tuple containing the test structs -using dpcpp_csvm_test_tuple = std::tuple>; +using dpcpp_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + dpcpp_csvm_test_type, +#endif + dpcpp_csvm_test_type, + dpcpp_csvm_test_type>; // the tests used in the instantiated GTest test suites // general test types @@ -231,7 +236,13 @@ INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPCSVM, GenericGPUCSVMKernelFunction, dpcpp_ke // generic GPU C-SVM DeathTests - correct grid sizes INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPCSVMDeathTest, GenericGPUCSVMDeathTest, dpcpp_csvm_test_type_gtest, naming::test_parameter_to_name); -using dpcpp_mock_csvm_test_tuple = std::tuple>; +using dpcpp_mock_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + dpcpp_csvm_test_type, +#endif + dpcpp_csvm_test_type, + dpcpp_csvm_test_type>; + using dpcpp_mock_csvm_test_type_list = util::cartesian_type_product_t; using dpcpp_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t; diff --git a/tests/backends/SYCL/implementation_types.cpp b/tests/backends/SYCL/implementation_types.cpp index 9f0819cc3..f7f31b45d 100644 --- a/tests/backends/SYCL/implementation_types.cpp +++ b/tests/backends/SYCL/implementation_types.cpp @@ -12,7 +12,8 @@ #include "tests/custom_test_macros.hpp" // EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING -#include "gtest/gtest.h" // TEST, EXPECT_TRUE +#include "gmock/gmock.h" // EXPECT_THAT; ::testing::Contains +#include "gtest/gtest.h" // TEST, EXPECT_TRUE, EXPECT_GE #include // std::istringstream diff --git a/tests/backends/SYCL/kernel_invocation_types.cpp b/tests/backends/SYCL/kernel_invocation_types.cpp index bcb2034b6..3227cb077 100644 --- a/tests/backends/SYCL/kernel_invocation_types.cpp +++ b/tests/backends/SYCL/kernel_invocation_types.cpp @@ -12,7 +12,8 @@ #include "tests/custom_test_macros.hpp" // 
EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING -#include "gtest/gtest.h" // TEST, EXPECT_TRUE +#include "gmock/gmock.h" // EXPECT_THAT; ::testing::Contains +#include "gtest/gtest.h" // TEST, EXPECT_TRUE, EXPECT_GE #include // std::istringstream @@ -20,12 +21,15 @@ TEST(SYCLKernelInvocationType, to_string) { // check conversions to std::string EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::automatic, "automatic"); - EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::nd_range, "nd_range"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::basic, "basic"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::work_group, "work_group"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::hierarchical, "hierarchical"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::scoped, "scoped"); } TEST(SYCLKernelInvocationType, to_string_unknown) { // check conversions to std::string from unknown file_format_type - EXPECT_CONVERSION_TO_STRING(static_cast(3), "unknown"); + EXPECT_CONVERSION_TO_STRING(static_cast(5), "unknown"); } // check whether the std::string -> plssvm::sycl::kernel_invocation_type conversions are correct @@ -35,8 +39,16 @@ TEST(SYCLKernelInvocationType, from_string) { EXPECT_CONVERSION_FROM_STRING("AUTOMATIC", plssvm::sycl::kernel_invocation_type::automatic); EXPECT_CONVERSION_FROM_STRING("auto", plssvm::sycl::kernel_invocation_type::automatic); EXPECT_CONVERSION_FROM_STRING("AUTO", plssvm::sycl::kernel_invocation_type::automatic); - EXPECT_CONVERSION_FROM_STRING("nd_range", plssvm::sycl::kernel_invocation_type::nd_range); - EXPECT_CONVERSION_FROM_STRING("ND_RANGE", plssvm::sycl::kernel_invocation_type::nd_range); + EXPECT_CONVERSION_FROM_STRING("basic", plssvm::sycl::kernel_invocation_type::basic); + EXPECT_CONVERSION_FROM_STRING("BASIC", plssvm::sycl::kernel_invocation_type::basic); + EXPECT_CONVERSION_FROM_STRING("work_group", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("WORK-GROUP", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("nd_range", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("ND-RANGE", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("hierarchical", plssvm::sycl::kernel_invocation_type::hierarchical); + EXPECT_CONVERSION_FROM_STRING("HIERARCHICAL", plssvm::sycl::kernel_invocation_type::hierarchical); + EXPECT_CONVERSION_FROM_STRING("scoped", plssvm::sycl::kernel_invocation_type::scoped); + EXPECT_CONVERSION_FROM_STRING("SCOPED", plssvm::sycl::kernel_invocation_type::scoped); } TEST(SYCLKernelInvocationType, from_string_unknown) { @@ -46,3 +58,15 @@ TEST(SYCLKernelInvocationType, from_string_unknown) { input >> invocation_type; EXPECT_TRUE(input.fail()); } + +TEST(SYCLKernelInvocationType, minimal_available_sycl_kernel_invocation_types) { + const std::vector invocation_type = plssvm::sycl::list_available_sycl_kernel_invocation_types(); + + // at least three must be available (automatic, basic, and work_group)! 
diff --git a/tests/detail/cmd/parser_predict.cpp b/tests/detail/cmd/parser_predict.cpp
index f8ee46ed3..8365494b7 100644
--- a/tests/detail/cmd/parser_predict.cpp
+++ b/tests/detail/cmd/parser_predict.cpp
@@ -10,13 +10,14 @@
 #include "plssvm/detail/cmd/parser_predict.hpp"
 
-#include "plssvm/backend_types.hpp"                       // plssvm::backend_type
-#include "plssvm/backends/Kokkos/execution_space.hpp"     // plssvm::kokkos::execution_space
-#include "plssvm/backends/SYCL/implementation_types.hpp"  // plssvm::sycl::implementation_type
-#include "plssvm/constants.hpp"                           // plssvm::real_type
-#include "plssvm/exceptions/exceptions.hpp"               // plssvm::cmd_parser_exit
-#include "plssvm/target_platforms.hpp"                    // plssvm::target_platform
-#include "plssvm/verbosity_levels.hpp"                    // plssvm::verbosity
+#include "plssvm/backend_types.hpp"                          // plssvm::backend_type
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
+#include "plssvm/backends/SYCL/implementation_types.hpp"     // plssvm::sycl::implementation_type
+#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/constants.hpp"                              // plssvm::real_type
+#include "plssvm/exceptions/exceptions.hpp"                  // plssvm::cmd_parser_exit
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
+#include "plssvm/verbosity_levels.hpp"                       // plssvm::verbosity
 
 #include "tests/custom_test_macros.hpp"      // EXPECT_CONVERSION_TO_STRING, EXPECT_THROW_WHAT
 #include "tests/detail/cmd/cmd_utility.hpp"  // util::ParameterBase
@@ -47,7 +48,9 @@ TEST_F(ParserPredict, minimal) {
     // check parsed values
     EXPECT_EQ(parser.backend, plssvm::backend_type::automatic);
     EXPECT_EQ(parser.target, plssvm::target_platform::automatic);
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
+    EXPECT_EQ(parser.kokkos_execution_space, plssvm::kokkos::execution_space::automatic);
     EXPECT_FALSE(parser.strings_as_labels);
     EXPECT_EQ(parser.input_filename, "data.libsvm");
     EXPECT_EQ(parser.model_filename, "data.libsvm.model");
@@ -69,6 +72,7 @@ TEST_F(ParserPredict, minimal_output) {
         "backend: automatic\n"
         "target platform: automatic\n"
         "SYCL implementation type: automatic\n"
+        "SYCL kernel invocation type: automatic\n"
         "Kokkos execution space: automatic\n"
         "label_type: int (default)\n"
         "real_type: {}\n"
@@ -86,7 +90,7 @@ TEST_F(ParserPredict, all_arguments) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-predict", "--backend", "cuda", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
@@ -108,8 +112,10 @@ TEST_F(ParserPredict, all_arguments) {
     EXPECT_EQ(parser.backend, plssvm::backend_type::cuda);
     EXPECT_EQ(parser.target, plssvm::target_platform::gpu_nvidia);
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::work_group);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::dpcpp);
 #else
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
@@ -137,7 +143,7 @@ TEST_F(ParserPredict, all_arguments_output) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-predict", "--backend", "automatic", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
@@ -161,9 +167,11 @@ TEST_F(ParserPredict, all_arguments_output) {
         "target platform: gpu_nvidia\n" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    correct += "SYCL implementation type: dpcpp\n";
+    correct += "SYCL implementation type: dpcpp\n"
+               "SYCL kernel invocation type: work_group\n";
 #else
-    correct += "SYCL implementation type: automatic\n";
+    correct += "SYCL implementation type: automatic\n"
+               "SYCL kernel invocation type: automatic\n";
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     correct += fmt::format("Kokkos execution space: {}\n", space);
@@ -236,6 +244,28 @@ INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictTargetPlatform, ::testing::
 
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
 
+class ParserPredictSYCLKernelInvocation : public ParserPredict,
+                                          public ::testing::WithParamInterface> { };
+
+TEST_P(ParserPredictSYCLKernelInvocation, parsing) {
+    const auto &[flag, value] = GetParam();
+    // convert string to sycl::kernel_invocation_type
+    const auto sycl_kernel_invocation_type = util::convert_from_string<plssvm::sycl::kernel_invocation_type>(value);
+    // create artificial command line arguments in test fixture
+    this->CreateCMDArgs({ "./plssvm-predict", flag, value, "data.libsvm" });
+    // create parameter object
+    const plssvm::detail::cmd::parser_predict parser{ this->get_comm(), this->get_argc(), this->get_argv() };
+    // test for correctness
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, sycl_kernel_invocation_type);
+}
+
+// clang-format off
+INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictSYCLKernelInvocation, ::testing::Combine(
+                ::testing::Values("--sycl_kernel_invocation_type"),
+                ::testing::Values("automatic", "auto", "basic", "nd_range", "work_group", "hierarchical", "scoped")),
+                naming::pretty_print_parameter_flag_and_value);
+// clang-format on
+
 class ParserPredictSYCLImplementation : public ParserPredict,
                                         public ::testing::WithParamInterface> { };
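Since `plssvm-predict` now also accepts `--sycl_kernel_invocation_type`, a caller may want to check that a parsed value is actually compiled into the current build before using it. The sketch below combines `plssvm::sycl::list_available_sycl_kernel_invocation_types()` (exercised by the new test above) with a plain `std::find`; the validation helper itself is illustrative only, and the header is assumed to declare the listing function alongside the enum.

```cpp
#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type, list_available_sycl_kernel_invocation_types (assumed)

#include <algorithm>  // std::find
#include <iostream>   // std::cout
#include <vector>     // std::vector

// hypothetical helper: is the requested invocation type usable in this build?
bool invocation_type_available(const plssvm::sycl::kernel_invocation_type requested) {
    const std::vector<plssvm::sycl::kernel_invocation_type> available = plssvm::sycl::list_available_sycl_kernel_invocation_types();
    return std::find(available.cbegin(), available.cend(), requested) != available.cend();
}

int main() {
    std::cout << std::boolalpha
              << invocation_type_available(plssvm::sycl::kernel_invocation_type::work_group) << '\n'     // always true
              << invocation_type_available(plssvm::sycl::kernel_invocation_type::hierarchical) << '\n';  // true only if hierarchical/scoped kernels were enabled at build time
    return 0;
}
```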
diff --git a/tests/detail/cmd/parser_train.cpp b/tests/detail/cmd/parser_train.cpp
index 94337f90b..36e70228c 100644
--- a/tests/detail/cmd/parser_train.cpp
+++ b/tests/detail/cmd/parser_train.cpp
@@ -63,6 +63,7 @@ TEST_F(ParserTrain, minimal) {
     EXPECT_EQ(parser.solver, plssvm::solver_type::automatic);
     EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
+    EXPECT_EQ(parser.kokkos_execution_space, plssvm::kokkos::execution_space::automatic);
     EXPECT_FALSE(parser.strings_as_labels);
     EXPECT_EQ(parser.input_filename, "data.libsvm");
     EXPECT_EQ(parser.model_filename, "data.libsvm.model");
@@ -107,7 +108,7 @@ TEST_F(ParserTrain, all_arguments) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-train", "--svm_type", "1", "--kernel_type", "1", "--degree", "2", "--gamma", "1.5", "--coef0", "-1.5", "--cost", "2", "--epsilon", "1e-12", "--max_iter", "100", "--classification", "oao", "--solver", "cg_implicit", "--backend", "cuda", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "nd_range", "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
@@ -141,7 +142,7 @@ TEST_F(ParserTrain, all_arguments) {
     EXPECT_EQ(parser.target, plssvm::target_platform::gpu_nvidia);
     EXPECT_EQ(parser.solver, plssvm::solver_type::cg_implicit);
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::nd_range);
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::work_group);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::dpcpp);
 #else
     EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
@@ -169,7 +170,7 @@ TEST_F(ParserTrain, all_arguments_output) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-train", "--svm_type", "1", "--kernel_type", "1", "--degree", "2", "--gamma", "1.5", "--coef0", "-1.5", "--cost", "2", "--epsilon", "1e-12", "--max_iter", "100", "--classification", "oao", "--solver", "cg_implicit", "--backend", "automatic", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "nd_range", "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const std::string space = fmt::format("{}", plssvm::kokkos::list_available_execution_spaces()[1]);  // [0] would be automatic
@@ -202,7 +203,7 @@ TEST_F(ParserTrain, all_arguments_output) {
         "solver: cg_implicit\n";
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     correct += "SYCL implementation type: dpcpp\n"
-               "SYCL kernel invocation type: nd_range\n";
+               "SYCL kernel invocation type: work_group\n";
 #else
     correct += "SYCL implementation type: automatic\n"
                "SYCL kernel invocation type: automatic\n";
@@ -547,7 +548,7 @@ TEST_P(ParserTrainSYCLKernelInvocation, parsing) {
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(ParserTrain, ParserTrainSYCLKernelInvocation, ::testing::Combine(
                 ::testing::Values("--sycl_kernel_invocation_type"),
-                ::testing::Values("automatic", "nd_range", "ND_RANGE")),
+                ::testing::Values("automatic", "auto", "basic", "nd_range", "work_group", "hierarchical", "scoped")),
                 naming::pretty_print_parameter_flag_and_value);
 // clang-format on
diff --git a/tests/parameter.cpp b/tests/parameter.cpp
index 80de6eec7..940aa7d08 100644
--- a/tests/parameter.cpp
+++ b/tests/parameter.cpp
@@ -99,7 +99,7 @@ TEST(Parameter, construct_parameter_and_named_args) {
     const plssvm::parameter param{ param_base,
                                    plssvm::kernel_type = plssvm::kernel_function_type::rbf,
                                    plssvm::sycl_implementation_type = plssvm::sycl::implementation_type::adaptivecpp,
-                                   plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range,
+                                   plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group,
                                    plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda };
 
     // test default values
diff --git a/utility_scripts/performance_analysis.py b/utility_scripts/performance_analysis.py
index 675fb5202..48a5cb179 100644
--- a/utility_scripts/performance_analysis.py
+++ b/utility_scripts/performance_analysis.py
@@ -120,7 +120,7 @@ def fit_model_with_timeout(csvm, data, eps):
         if sycl_impl == plssvm.sycl.ImplementationType.AUTOMATIC:
             continue
         available_backends.append((backend, { "sycl_implementation_type": sycl_impl,
-                                              "sycl_kernel_invocation_type": plssvm.sycl.KernelInvocationType.ND_RANGE }))
+                                              "sycl_kernel_invocation_type": plssvm.sycl.KernelInvocationType.WORK_GROUP }))
     else:
         available_backends.append((backend, { }))