diff --git a/CMakeLists.txt b/CMakeLists.txt index d43c1f825..4477cf212 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -903,11 +903,12 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) choose the SYCL implementation to be used in the SYCL backend: ${PLSSVM_SYCL_BACKEND_NAME_LIST} (default: automatic) " ) + string(REPLACE ";" "|" PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST}") set(PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY " .TP .B --sycl_kernel_invocation_type -choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic) +choose the kernel invocation type when using SYCL as backend: ${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST} (default: automatic) " ) endif () @@ -936,8 +937,6 @@ endif () # configure the manpage files configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-train.1.in ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-train.1 @ONLY) -# update manpage entry since plssvm-predict can't recognize the SYCL kernel invocation type -set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-predict.1.in ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-predict.1 @ONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-scale.1.in ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-scale.1 @ONLY) diff --git a/README.md b/README.md index 797ba4133..5fb8ebe31 100644 --- a/README.md +++ b/README.md @@ -346,6 +346,8 @@ If the SYCL backend is available, additional options can be set. - `AUTO`: check for DPC++/icpx as implementation for the SYCL backend but **do not** fail if not available - `OFF`: do not check for DPC++/icpx as implementation for the SYCL backend +- `PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS` (default: `ON`): enable SYCL's `hierarchical` and AdaptiveCpp's `scoped` kernel invocation types + To use DPC++/icpx for SYCL, simply set the `CMAKE_CXX_COMPILER` to the respective DPC++/icpx clang executable during CMake invocation. If the SYCL implementation is DPC++/icpx the following additional options are available: @@ -684,7 +686,7 @@ Usage: -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) --sycl_kernel_invocation_type arg - choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic) + choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg @@ -745,7 +747,7 @@ The `--target_platform=automatic` option works for the different backends as fol - `stdpar`: target device must be selected at compile time (using `PLSSVM_TARGET_PLATFORMS`) or using environment variables at runtime The `--sycl_kernel_invocation_type` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. -If the `--sycl_kernel_invocation_type` is `automatic`, the `nd_range` invocation type is currently always used. +If the `--sycl_kernel_invocation_type` is `automatic`, the `work_group` invocation type is currently always used. 
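For reference, a minimal C++ sketch of the documented `automatic` resolution, assuming only the `plssvm::sycl::kernel_invocation_type` enumerators introduced in this change set; the helper name `resolve_invocation_type` is hypothetical and not part of the PLSSVM API:

```cpp
#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type

// hypothetical helper mirroring the documented behavior: `automatic` currently always
// resolves to the `work_group` invocation type, every other value is kept unchanged
[[nodiscard]] constexpr plssvm::sycl::kernel_invocation_type resolve_invocation_type(const plssvm::sycl::kernel_invocation_type invocation) noexcept {
    return invocation == plssvm::sycl::kernel_invocation_type::automatic
               ? plssvm::sycl::kernel_invocation_type::work_group
               : invocation;
}
```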
If the `--sycl_implementation_type` is `automatic`, the used SYCL implementation is determined by the `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` CMake flag. If the `--kokkos_execution_space` is `automatic`, uses the best fitting execution space based on the provided and/or available target platforms. @@ -793,6 +795,8 @@ Usage: -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) + --sycl_kernel_invocation_type arg + choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg diff --git a/bindings/Python/README.md b/bindings/Python/README.md index 196dbfc7b..504d2533b 100644 --- a/bindings/Python/README.md +++ b/bindings/Python/README.md @@ -332,10 +332,10 @@ The following table lists all PLSSVM enumerations exposed on the Python side: If a SYCL implementation is available, additional enumerations are available: -| enumeration | values | description | -|------------------------|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | -| `KernelInvocationType` | `AUTOMATIC`, `ND_RANGE` | The different supported SYCL kernel invocation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `ND_RANGE` (only implemented to be able to add new invocation types in the future). | +| enumeration | values | description | +|------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | +| `KernelInvocationType` | `AUTOMATIC`, `BASIC`, `WORK_GROUP`, `HIERARCHICAL`, `SCOPED` | The different supported SYCL kernel invocation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `WORK_GROUP`. 
| If the stdpar backend is available, an additional enumeration is available: diff --git a/bindings/Python/backends/sycl.cpp b/bindings/Python/backends/sycl.cpp index f2cc924d6..98c27214b 100644 --- a/bindings/Python/backends/sycl.cpp +++ b/bindings/Python/backends/sycl.cpp @@ -48,7 +48,10 @@ void init_sycl(py::module_ &m, const py::exception &base_exce py::enum_ py_enum_invocation(sycl_module, "KernelInvocationType", "Enum class for all possible SYCL kernel invocation types supported in PLSSVM."); py_enum_invocation .value("AUTOMATIC", plssvm::sycl::kernel_invocation_type::automatic, "use the best kernel invocation type for the current SYCL implementation and target hardware platform") - .value("ND_RANGE", plssvm::sycl::kernel_invocation_type::nd_range, "use the nd_range kernel invocation type"); + .value("BASIC", plssvm::sycl::kernel_invocation_type::basic, "use the basic data parallel kernel invocation type") + .value("WORK_GROUP", plssvm::sycl::kernel_invocation_type::work_group, "use the work-group data parallel kernel invocation type") + .value("HIERARCHICAL", plssvm::sycl::kernel_invocation_type::hierarchical, "use the hierarchical data parallel kernel invocation type") + .value("SCOPED", plssvm::sycl::kernel_invocation_type::scoped, "use the AdaptiveCpp specific scoped parallelism kernel invocation type"); // enable implicit conversion from string to enum plssvm::bindings::python::util::register_implicit_str_enum_conversion(py_enum_invocation); diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox index dcb4337d0..412fe72b1 100644 --- a/docs/resources/dirs.dox +++ b/docs/resources/dirs.dox @@ -18,7 +18,7 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Directory containing the implementation of all four available backends: OpenMP, CUDA, OpenCL, and SYCL. + * @brief Directory containing the implementation of all available backends. */ /** @@ -488,6 +488,50 @@ * @brief Directory containing kernel implementations for the explicit CG algorithm using the SYCL backend. */ +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/basic + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing basic data parallel kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing hierarchical kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/scoped + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. 
+ * + * @brief Directory containing scoped-parallelism kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_explicit/work_group + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing work-group data parallel kernel implementations for the explicit CG algorithm using the SYCL backend. + */ + /** * @dir include/plssvm/backends/SYCL/kernel/cg_implicit * @author Alexander Van Craen @@ -499,6 +543,105 @@ * @brief Directory containing kernel implementations for the implicit CG algorithm using the SYCL backend. */ +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/basic + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing basic data parallel kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing hierarchical kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/scoped + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing scoped-parallelism kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/cg_implicit/work_group + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing work-group data parallel kernel implementations for the implicit CG algorithm using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing kernel implementations for the predictions using the SYCL backend. 
+ */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/basic + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing basic data parallel kernel implementations for the predictions using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/hierarchical + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing hierarchical kernel implementations for the predictions using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/scoped + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing scoped-parallelism kernel implementations for the predictions using the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/kernel/predict/work_group + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing work-group data parallel kernel implementations for the predictions using the SYCL backend. + */ + /** * @dir include/plssvm/backends/SYCL/DPCPP * @author Alexander Van Craen diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp index 4fd639732..55b6a746b 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp @@ -23,6 +23,7 @@ #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES, plssvm::detail::is_one_type_of +#include "plssvm/exceptions/exceptions.hpp" // plssvm::invalid_parameter_exception #include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator #include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::{has_only_sycl_parameter_named_args_v, has_only_sycl_named_args_v} #include "plssvm/svm/csvc.hpp" // plssvm::csvc @@ -76,6 +77,14 @@ class csvm : public ::plssvm::detail::gpu_csvm(parser, sycl_kernel_invocation_type); + +#if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the AdaptiveCpp SYCL backend!" }; + } else if (invocation_type_ == sycl::kernel_invocation_type::scoped) { + throw ::plssvm::invalid_parameter_exception{ "he provided sycl::kernel_invocation_type::scoped is disabled for the AdaptiveCpp SYCL backend!" 
}; + } +#endif } this->init(target); } diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp index bb4ff90a6..23ffb1872 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp @@ -15,9 +15,11 @@ #include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue (PImpl) +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/detail/utility.hpp" // plssvm::detail::unreachable #include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include // std::string #include // std::pair @@ -46,6 +48,30 @@ template } } +/** + * @brief Convert the provided @p grid and @p block to the final SYCL execution range. + * @tparam invocation_type the SYCL kernel invocation type + * @param[in] grid the execution grid + * @param[in] block the execution block + * @return the SYCL native execution range + */ +template +auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { + const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); + const ::sycl::range native_block = detail::dim_type_to_native<2>(block); + + if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + return ::sycl::range<2>{ native_grid * native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical || invocation_type == sycl::kernel_invocation_type::scoped) { + return ::sycl::nd_range<2>{ native_grid, native_block }; + } else { + // can't be reached + ::plssvm::detail::unreachable(); + } +} + /** * @brief Returns the list devices matching the target platform @p target and the actually used target platform * (only interesting if the provided @p target was automatic). diff --git a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp index 4bcdc2da9..4b1a6b570 100644 --- a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp +++ b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp @@ -23,6 +23,7 @@ #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES, plssvm::detail::is_one_type_of +#include "plssvm/exceptions/exceptions.hpp" // plssvm::invalid_parameter_exception #include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator #include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::{has_only_sycl_parameter_named_args_v, has_only_sycl_named_args_v} #include "plssvm/svm/csvc.hpp" // plssvm::csvc @@ -63,6 +64,7 @@ class csvm : public ::plssvm::detail::gpu_csvm(parser, sycl_kernel_invocation_type); + // the invocation type "scoped" isn't supported by DPC++ + if (invocation_type_ == sycl::kernel_invocation_type::scoped) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::scoped isn't supported by DPC++!" 
}; + } + +#if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the DPC++ SYCL backend!" }; + } +#endif } this->init(target); } diff --git a/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp b/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp index 327cf1ac7..d61a73407 100644 --- a/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp +++ b/include/plssvm/backends/SYCL/DPCPP/detail/utility.hpp @@ -13,11 +13,13 @@ #define PLSSVM_BACKENDS_SYCL_DPCPP_DETAIL_UTILITY_HPP_ #pragma once -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type -#include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::dpcpp::detail::queue (PImpl) -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::dpcpp::detail::queue (PImpl) +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/detail/utility.hpp" // plssvm::detail::unreachable +#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include // std::size_t #include // std::string @@ -47,6 +49,30 @@ template } } +/** + * @brief Convert the provided @p grid and @p block to the final SYCL execution range. + * @tparam invocation_type the SYCL kernel invocation type + * @param[in] grid the execution grid + * @param[in] block the execution block + * @return the SYCL native execution range + */ +template +auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { + const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); + const ::sycl::range native_block = detail::dim_type_to_native<2>(block); + + if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + return ::sycl::range<2>{ native_grid * native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; + } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical) { + return ::sycl::nd_range<2>{ native_grid, native_block }; + } else { + // can't be reached + ::plssvm::detail::unreachable(); + } +} + /** * @brief Returns the list devices matching the target platform @p target and the actually used target platform * (only interesting if the provided @p target was automatic). diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp new file mode 100644 index 000000000..2e528149c --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -0,0 +1,330 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and the basic data parallel kernels. 
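To make the `get_execution_range` mapping introduced above concrete, here is a minimal sketch using plain SYCL types; the 4x4 grid of 16x16 work-items is an arbitrary example, not a PLSSVM default:

```cpp
#include <sycl/sycl.hpp>

int main() {
    // illustrative sizes only: 4x4 work-groups ("grid") of 16x16 work-items each ("block")
    const sycl::range<2> grid{ 4, 4 };
    const sycl::range<2> block{ 16, 16 };

    // basic: one flat range covering all work-items, launched via parallel_for(range, ...)
    const sycl::range<2> basic_range{ grid * block };                 // -> { 64, 64 }

    // work_group: an nd_range with explicit global and local sizes
    const sycl::nd_range<2> work_group_range{ grid * block, block };  // -> global { 64, 64 }, local { 16, 16 }

    // hierarchical (and AdaptiveCpp's scoped): group count and group size are kept separate,
    // transported in an nd_range exactly as in the functions above
    const sycl::nd_range<2> hierarchical_range{ grid, block };        // -> { 4, 4 } groups of { 16, 16 } work-items

    return 0;
}
```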
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "sycl/sycl.hpp" // sycl::item + +#include // std::size_t + +namespace plssvm::sycl::detail::basic { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { + // perform the dot product calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim < global_j) { + A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + + // apply the (partial) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * Uses SYCL's basic data parallel kernels. + */ +class device_kernel_symm_mirror { + public: + /** + * @brief Initialize the SYCL kernel function object. 
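The accesses to `A_` in `device_kernel_symm` above follow the packed, padded upper-triangular storage of the symmetric matrix. As a reading aid, a small hedged helper restating that index computation; `packed_upper_index` and its parameter names are illustrative and not part of PLSSVM:

```cpp
#include <cstddef>  // std::size_t

// Illustrative helper (not PLSSVM code): linear index of element (i, j) of a symmetric matrix stored
// as a row-wise packed upper triangle with logical size n and per-row padding pad. Row r holds the
// columns r..n-1 plus pad padding entries, so the rows before it occupy r * (n + pad) - r * (r - 1) / 2
// elements; adding the in-row offset (col - row) yields the expression used in the kernel above.
[[nodiscard]] constexpr std::size_t packed_upper_index(const std::size_t i, const std::size_t j, const std::size_t n, const std::size_t pad) noexcept {
    const std::size_t row = i < j ? i : j;  // the kernel mirrors accesses below the diagonal
    const std::size_t col = i < j ? j : i;
    return row * (n + pad) + col - row * (row + 1) / 2;
}
```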
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over the remaining features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + + // apply the (remaining) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto partial_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds 
accesses + if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_inplace_matrix_add { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type *rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_inplace_matrix_scale { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::basic + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..65587ddaa --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -0,0 +1,139 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and the basic data parallel kernels. 
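Because the basic kernels above are plain function objects invoked with a `sycl::item<2>`, they are dispatched over a flat `sycl::range` (the `basic` branch of `get_execution_range` earlier in this patch). The following hedged, self-contained sketch shows that dispatch pattern; the `scale_kernel` functor and all sizes are stand-ins, not PLSSVM code:

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>  // std::size_t

// stand-in for a basic data parallel functor in the style of device_kernel_inplace_matrix_scale
class scale_kernel {
  public:
    scale_kernel(double *data, const double scale) :
        data_{ data },
        scale_{ scale } { }

    // one work-item per matrix entry, addressed through the flat sycl::item
    void operator()(::sycl::item<2> idx) const {
        data_[idx.get_id(0) * idx.get_range(1) + idx.get_id(1)] *= scale_;
    }

  private:
    double *data_;
    const double scale_;
};

int main() {
    ::sycl::queue q{};

    constexpr std::size_t n = 64;
    double *data = ::sycl::malloc_device<double>(n * n, q);
    q.fill(data, 1.0, n * n).wait();

    // basic invocation: a flat range (grid * block collapsed into a single range), no explicit work-group size
    q.parallel_for(::sycl::range<2>{ n, n }, scale_kernel{ data, 2.0 }).wait();

    ::sycl::free(data, q);
    return 0;
}
```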
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::basic { + +/** + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @details Uses SYCL's basic data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_assembly { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] kernel_matrix_d the calculated kernel matrix + * @param[in] data_d the data points to calculate the kernel matrix from + * @param[in] num_rows the number of data points + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] q the vector used in the dimensional reduction + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_d_{ kernel_matrix_d }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + q_{ q }, + QA_cost_{ QA_cost }, + cost_{ cost }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + } + + /** + * @brief Function call operator overload performing the actual calculation. 
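As a reading aid for the assembly performed by the `operator()` that follows: each stored kernel matrix entry combines the kernel function value with the dimensional reduction terms `QA_cost`, `q[i]`, and `q[j]`, plus the `cost` term on the diagonal. A hedged host-side reference, assuming the linear kernel function (the function name and containers are illustrative, not PLSSVM code):

```cpp
#include <cstddef>  // std::size_t
#include <vector>   // std::vector

// illustrative host-side reference (not PLSSVM code) for one assembled entry with i >= j
double assembled_entry(const std::vector<std::vector<double>> &data, const std::vector<double> &q,
                       const double QA_cost, const double cost, const std::size_t i, const std::size_t j) {
    // linear kernel: the accumulated feature reduction is already the kernel value k(x_i, x_j)
    double k_ij = 0.0;
    for (std::size_t d = 0; d < data[i].size(); ++d) {
        k_ij += data[i][d] * data[j][d];
    }
    // apply the dimensional reduction terms
    double value = k_ij + QA_cost - q[i] - q[j];
    // the cost factor is only added on the diagonal
    if (i == j) {
        value += cost;
    }
    return value;
}
```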
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + if (i >= j) { + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + } + } + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *kernel_matrix_d_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type *q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::basic + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp new file mode 100644 index 000000000..de6358ec8 --- /dev/null +++ 
b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -0,0 +1,472 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and the hierarchical data parallel kernels. + */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item + +#include // std::size_t + +namespace plssvm::sycl::detail::hierarchical { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // determine on which side of the diagonal we are located + if (dim + threadIdx_x < global_j) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + // 
determine on which side of the diagonal we are located + if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // implicit barrier + + // perform the dot product calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + + // implicit barrier + } + + // apply the (partial) BLAS operation and update C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_symm_mirror { + public: + /** + * @brief Initialize the SYCL kernel function object. 
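The hierarchical kernels above receive a whole `sycl::group<2>` and spawn their work-items explicitly via `parallel_for_work_item`, so they are launched with `parallel_for_work_group`, passing the group count and the group size separately (the `hierarchical` branch of `get_execution_range`). Below is a hedged, self-contained sketch of that launch pattern; the `add_one_kernel` functor and the sizes are illustrative assumptions, not PLSSVM code:

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>  // std::size_t

// stand-in for a hierarchical functor in the style of the kernels above
class add_one_kernel {
  public:
    explicit add_one_kernel(double *data) :
        data_{ data } { }

    // invoked once per work-group; the work-items are spawned explicitly inside
    void operator()(::sycl::group<2> group) const {
        group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
            const std::size_t row = idx.get_global_id(0);
            const std::size_t col = idx.get_global_id(1);
            data_[row * idx.get_global_range(1) + col] += 1.0;
        });
        // consecutive parallel_for_work_item scopes are separated by an implicit work-group barrier
    }

  private:
    double *data_;
};

int main() {
    ::sycl::queue q{};

    constexpr std::size_t n = 64;
    double *data = ::sycl::malloc_device<double>(n * n, q);
    q.fill(data, 0.0, n * n).wait();

    // hierarchical invocation: the number of work-groups and the work-group size are passed separately
    q.submit([&](::sycl::handler &cgh) {
         cgh.parallel_for_work_group(::sycl::range<2>{ n / 16, n / 16 }, ::sycl::range<2>{ 16, 16 }, add_one_kernel{ data });
     }).wait();

    ::sycl::free(data, q);
    return 0;
}
```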
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices and diagonal condition + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // iterate over the remaining features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // implicit barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + + // implicit barrier + } + + // apply the (remaining) BLAS operation and update C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto partial_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && 
partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_inplace_matrix_add { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(0); + const std::size_t threadIdx_y = idx.get_local_id(1); + const std::size_t blockDim_x = idx.get_local_range(0); + const std::size_t blockDim_y = idx.get_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type *rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_inplace_matrix_scale { + public: + /** + * @brief Initialize the SYCL kernel function object. 
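+     * @details In contrast to the BLAS kernels above, no local memory and no `private_memory` helpers are needed: the whole operator() body is a single
+     *          `parallel_for_work_item` phase in which every work-item updates its own `INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE` tile of @p lhs.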
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(0); + const std::size_t threadIdx_y = idx.get_local_id(1); + const std::size_t blockDim_x = idx.get_local_range(0); + const std::size_t blockDim_y = idx.get_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..b09fef0f8 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -0,0 +1,203 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and the hierarchical data parallel kernels. 
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                               // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                   // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp"  // sycl::group, sycl::private_memory, sycl::h_item
+
+#include <cstddef>  // std::size_t
+#include <tuple>    // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::hierarchical {
+
+/**
+ * @brief Create the explicit kernel matrix using the @p kernel_function.
+ * @details Uses SYCL's hierarchical data parallel kernels.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple`
+ */
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_assembly {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+     * @param[out] kernel_matrix_d the calculated kernel matrix
+     * @param[in] data_d the data points to calculate the kernel matrix from
+     * @param[in] num_rows the number of data points
+     * @param[in] device_num_rows the number of rows the current device is responsible for
+     * @param[in] row_offset the first row in @p data_d the current device is responsible for
+     * @param[in] num_features the number of features per data point
+     * @param[in] q the vector used in the dimensional reduction
+     * @param[in] QA_cost the scalar used in the dimensional reduction
+     * @param[in] cost the cost factor the diagonal is scaled with
+     * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+     * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+     * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
+     */
+    device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) :
+        kernel_matrix_d_{ kernel_matrix_d },
+        data_d_{ data_d },
+        num_rows_{ num_rows },
+        device_num_rows_{ device_num_rows },
+        row_offset_{ row_offset },
+        num_features_{ num_features },
+        q_{ q },
+        QA_cost_{ QA_cost },
+        cost_{ cost },
+        grid_x_offset_{ grid_x_offset },
+        grid_y_offset_{ grid_y_offset },
+        kernel_function_parameter_{ std::make_tuple(std::forward<Args>(kernel_function_parameter)...) } {
+    }
+
+    /**
+     * @brief Function call operator overload performing the actual calculation.
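+     * @details The body is split into several `parallel_for_work_item` phases; the implicit work-group barrier between consecutive phases replaces the
+     *          explicit barrier calls of the `work_group` (nd_range) variant. A hedged construction sketch inside a command group (the handler `cgh`, the
+     *          `rbf` template argument, and the trailing `gamma` parameter are merely illustrative; the real values are supplied by the SYCL backend):
+     * @code{.cpp}
+     * cgh.parallel_for_work_group(::sycl::range<2>{ grid_y, grid_x },
+     *                             ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                             device_kernel_assembly<kernel_function_type::rbf, real_type>{ kernel_matrix_d, data_d, num_rows, device_num_rows, row_offset, num_features, q, QA_cost, cost, grid_x_offset, grid_y_offset, gamma });
+     * @endcode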
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // exploit symmetry + if (group[1] >= group[0]) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; 
+ data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // implicit group barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + + // implicit barrier + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp(idx)[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + }); + } + } + + private: + /// @cond Doxygen_suppress + real_type *kernel_matrix_d_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type *q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp new file mode 100644 index 000000000..9e8500d73 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -0,0 +1,457 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project 
which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and AdaptiveCpp's scoped parallelism. + */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
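+     * @details Hypothetical launch sketch, assuming AdaptiveCpp's scoped-parallelism entry point `handler::parallel(num_groups, group_size, functor)`; the
+     *          exact launch call as well as the `q`, `grid_x`, and `grid_y` names are assumptions and not defined in this header:
+     * @code{.cpp}
+     * q.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel(::sycl::range<2>{ grid_y, grid_x },
+     *                  ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                  device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A, B, beta, C, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode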
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // determine on which side of the diagonal we are located + if (dim + threadIdx_x < global_j) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + // determine on which side of the diagonal we are located + if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { + A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + 
THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // perform calculations + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + } + + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_symm_mirror { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += 
static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; + A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + }); + + // perform calculations + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } + }); + } + + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i(idx) + static_cast(internal_i); + const auto partial_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + } + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const 
std::size_t row_offset_; + const real_type alpha_; + const real_type *A_; + const real_type *B_; + const real_type beta_; + real_type *C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_inplace_matrix_add { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + [&]() { + // scale + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + const std::size_t threadIdx_y = idx.get_local_id(group, 1); + const std::size_t blockDim_x = group.get_logical_local_range(0); + const std::size_t blockDim_y = group.get_logical_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type *rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_inplace_matrix_scale { + public: + /** + * @brief Initialize the SYCL kernel function object. 
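+     * @details The `memory_environment` call below requests no local or private allocations; it merely provides the scope in which a single
+     *          `distribute_items_and_wait` performs the element-wise `lhs += rhs` update.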
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + [&]() { + // scale + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + const std::size_t threadIdx_y = idx.get_local_id(group, 1); + const std::size_t blockDim_x = group.get_logical_local_range(0); + const std::size_t blockDim_y = group.get_logical_local_range(1); + const std::size_t blockIdx_x = group[0] + grid_x_offset_; + const std::size_t blockIdx_y = group[1] + grid_y_offset_; + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // indices + const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + + for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { + for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { + const std::size_t global_i = i + internal_i; + const std::size_t global_j = j + internal_j; + + lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + real_type *lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::scoped + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..4ed3764ce --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -0,0 +1,189 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and AdaptiveCpp's scoped parallelism. 
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_assembly { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] kernel_matrix_d the calculated kernel matrix + * @param[in] data_d the data points to calculate the kernel matrix from + * @param[in] num_rows the number of data points + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] q the vector used in the dimensional reduction + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_d_{ kernel_matrix_d }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + q_{ q }, + QA_cost_{ QA_cost }, + cost_{ cost }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + } + + /** + * @brief Function call operator overload performing the actual calculation. 
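+     * @details `require_local_mem` provides the two feature caches, `require_private_mem` the per-work-item indices and the `temp` accumulator. Each
+     *          `distribute_items_and_wait` call forms one barrier-separated phase, mirroring the implicit barriers of the hierarchical variant, while the
+     *          `group[1] >= group[0]` check exploits the symmetry of the kernel matrix by computing only one triangular part.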
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // exploit symmetry + if (group[1] >= group[0]) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into shared memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // 
perform calculations + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + } + + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_j = j(idx) + static_cast(internal_j); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp(idx)[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + }); + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *kernel_matrix_d_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type *q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::scoped + +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp similarity index 94% rename from include/plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp rename to include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index c5cfca67f..ae07f7ec6 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -6,23 +6,24 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend. 
+ * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the SYCL backend and the work-group data parallel kernels. */ -#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BLAS_HPP_ -#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BLAS_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} -#include "sycl/sycl.hpp" // sycl::nd_item +#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor #include // std::size_t -namespace plssvm::sycl::detail { +namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_symm { public: @@ -87,20 +88,20 @@ class device_kernel_symm { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // determine on which side of the diagonal we are located - if (dim + nd_idx.get_local_id(0) < global_j) { + if (dim + threadIdx_x < global_j) { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } // determine on which side of the diagonal we are located - if (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE < global_j) { + if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; @@ -161,6 +162,7 @@ class device_kernel_symm { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! 
+ * Uses SYCL's work-group data parallel kernels. */ class device_kernel_symm_mirror { public: @@ -234,8 +236,8 @@ class device_kernel_symm_mirror { const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + nd_idx.get_local_id(0)) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + nd_idx.get_local_id(0)) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + nd_idx.get_local_id(0)) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + nd_idx.get_local_id(0) + THREAD_BLOCK_SIZE_uz) + global_j]; + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; + A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; @@ -292,6 +294,7 @@ class device_kernel_symm_mirror { /** * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_inplace_matrix_add { public: @@ -351,6 +354,7 @@ class device_kernel_inplace_matrix_add { /** * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_inplace_matrix_scale { public: @@ -408,6 +412,6 @@ class device_kernel_inplace_matrix_scale { /// @endcond }; -} // namespace plssvm::sycl::detail +} // namespace plssvm::sycl::detail::work_group -#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BLAS_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp similarity index 95% rename from include/plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp rename to include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index a00fa2d4a..96030fbe7 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -6,26 +6,27 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. 
* See the LICENSE.md file in the project root for full license information. * - * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend. + * @brief Functions for explicitly assembling the kernel matrix using the SYCL backend and the work-group data parallel kernels. */ -#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ -#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "sycl/sycl.hpp" // sycl::nd_item +#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor #include // std::size_t #include // std::tuple, std::make_tuple -namespace plssvm::sycl::detail { +namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. + * @details Uses SYCL's work-group data parallel kernels. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ @@ -171,6 +172,6 @@ class device_kernel_assembly { /// @endcond }; -} // namespace plssvm::sycl::detail +} // namespace plssvm::sycl::detail::work_group -#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp new file mode 100644 index 000000000..7b517a7b1 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -0,0 +1,159 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and the basic data parallel kernels. 
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::basic { + +/** + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @details Uses SYCL's basic data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function + */ +template +class device_kernel_assembly_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
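+     * @details A minimal launch sketch, assuming a `::sycl::queue` named `queue`, suitable grid ranges `grid_range_y`/`grid_range_x`,
+     *          and already allocated device pointers; the linear kernel function is used here because it needs no additional
+     *          parameters:
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for(::sycl::range<2>{ grid_range_y, grid_range_x },
+     *                      device_kernel_assembly_symm<kernel_function_type::linear>{ alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode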
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // only calculate the upper triangular matrix + if (i >= j) { + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + } + } + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto device_global_i = i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto device_global_j = j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + // calculate the values of alpha * A * B + for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + } + } else { + // calculate the values of alpha * A * B + for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // symmetry + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + } + } + } + } + } + } + } + + private: + /// @cond Doxygen_suppress + const real_type alpha_; + const real_type *q_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + 
    const std::size_t num_features_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const real_type *B_;
+    real_type *C_;
+    const std::size_t num_classes_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::tuple<Args...> kernel_function_parameter_;
+    /// @endcond
+};
+
+}  // namespace plssvm::sycl::detail::basic
+
+#endif  // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp
new file mode 100644
index 000000000..d2f7b0a5c
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp
@@ -0,0 +1,366 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and the hierarchical data parallel kernels.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp"  // sycl::group, sycl::private_memory, sycl::h_item
+
+#include <cstddef>  // std::size_t
+#include <tuple>    // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::hierarchical {
+
+/**
+ * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
+ * @details Uses SYCL's hierarchical data parallel kernels.
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function
+ */
+template <kernel_function_type kernel_function, typename... Args>
+class device_kernel_assembly_symm {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+ * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
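+     * @details A minimal launch sketch, assuming a `::sycl::queue` named `queue`, the number of work-groups `num_groups_y`/`num_groups_x`,
+     *          and already allocated device pointers; hierarchical kernels receive the work-group size as a separate range:
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for_work_group(::sycl::range<2>{ num_groups_y, num_groups_x },
+     *                                 ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                                 device_kernel_assembly_symm<kernel_function_type::linear>{ alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode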
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_i[FEATURE_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_j[FEATURE_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory i{ group }; + ::sycl::private_memory i_linear{ group }; + ::sycl::private_memory j{ group }; + ::sycl::private_memory j_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further + if (group[1] >= group[0]) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * 
THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // implicit group barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + + // implicit group barrier + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + const auto device_global_j = j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // implicit group barrier + + // calculate C += alpha * temp * B for the UPPER triangular matrix + { + // allocate shared memory + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_y = idx.get_local_id(1); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j(idx) + static_cast(internal); + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + } + }); + + // implicit group barrier + } + } + + // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // implicit group barrier + + // calculate C += alpha * temp 
* B for the LOWER triangular matrix + { + // allocate shared memory + auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to C + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal); + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ 
C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + + // implicit group barrier + } + } + } + } + + private: + /// @cond Doxygen_suppress + const real_type alpha_; + const real_type *q_; + const real_type *data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + const real_type QA_cost_; + const real_type cost_; + const real_type *B_; + real_type *C_; + const std::size_t num_classes_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp new file mode 100644 index 000000000..4391f2f19 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp @@ -0,0 +1,343 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and AdaptiveCpp's scoped parallelism. + */ + +#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function + */ +template +class device_kernel_assembly_symm { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
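+     * @details A minimal launch sketch, assuming a `::sycl::queue` named `queue`, the number of work-groups `num_groups_y`/`num_groups_x`,
+     *          and already allocated device pointers; AdaptiveCpp's scoped parallelism extension provides a `handler::parallel()`
+     *          entry point (the exact spelling may differ between AdaptiveCpp versions):
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel<class scoped_assembly_symm>(::sycl::range<2>{ num_groups_y, num_groups_x },
+     *                                              ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                                              device_kernel_assembly_symm<kernel_function_type::linear>{ alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode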
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // exploit symmetry + if (group[1] >= group[0]) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; + data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + 
data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + }); + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto device_global_i = i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + const auto device_global_j = j(idx) + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // calculate C += alpha * temp * B for the UPPER triangular matrix + { + // rename cached arrays + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + 
B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + } + }); + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + } + } + }); + } + + // add intermediate cached results to C + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_y = idx.get_local_id(group, 1); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j(idx) + static_cast(internal); + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + } + }); + } + } + + // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); + const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + + if (global_i == global_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + }); + + // calculate C += alpha * temp * B for the LOWER triangular matrix + { + // allocate shared memory + auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += 
static_cast(FEATURE_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to C + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i(idx) + static_cast(internal); + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + + // implicit group barrier + } + } + } + }); + } + + private: + /// @cond Doxygen_suppress + 
    const real_type alpha_;
+    const real_type *q_;
+    const real_type *data_d_;
+    const std::size_t num_rows_;
+    const std::size_t device_num_rows_;
+    const std::size_t row_offset_;
+    const std::size_t num_features_;
+    const real_type QA_cost_;
+    const real_type cost_;
+    const real_type *B_;
+    real_type *C_;
+    const std::size_t num_classes_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::tuple<Args...> kernel_function_parameter_;
+    /// @endcond
+};
+
+}  // namespace plssvm::sycl::detail::scoped
+
+#endif  // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp
similarity index 97%
rename from include/plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
rename to include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp
index c1a337107..34b55fff4 100644
--- a/include/plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp
@@ -6,11 +6,11 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  * See the LICENSE.md file in the project root for full license information.
  *
- * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend.
+ * @brief Functions for implicitly assembling the kernel matrix using the SYCL backend and the work-group data parallel kernels.
  */
 
-#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
-#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#ifndef PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
 #pragma once
 
 #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op
@@ -18,15 +18,16 @@
 #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
 
-#include "sycl/sycl.hpp" // sycl::nd_item
+#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor
 
 #include <cstddef> // std::size_t
 #include <tuple>   // std::tuple, std::make_tuple
 
-namespace plssvm::sycl::detail {
+namespace plssvm::sycl::detail::work_group {
 
 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
+ * @details Uses SYCL's work-group data parallel kernels.
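+ * A minimal launch sketch, assuming a `::sycl::queue` named `queue`, a precomputed `::sycl::nd_range<2>` `execution_range` whose
+ * local size is `THREAD_BLOCK_SIZE x THREAD_BLOCK_SIZE`, and already allocated device pointers; judging from the included headers,
+ * the work-group kernels are assumed to take the `sycl::handler` on construction to create their `local_accessor` caches:
+ * @code
+ * queue.submit([&](::sycl::handler &cgh) {
+ *     // the handler is forwarded so the kernel can allocate its local memory caches (assumption, see above)
+ *     cgh.parallel_for(execution_range,
+ *                      device_kernel_assembly_symm<kernel_function_type::linear>{ cgh, alpha, q_d, data_d, num_rows, device_num_rows, row_offset, num_features, QA_cost, cost, B_d, C_d, num_classes, grid_x_offset, grid_y_offset });
+ * });
+ * @endcode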
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
@@ -275,6 +276,6 @@ class device_kernel_assembly_symm {
     /// @endcond
 };
 
-} // namespace plssvm::sycl::detail
+} // namespace plssvm::sycl::detail::work_group
 
-#endif // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
+#endif // PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp
new file mode 100644
index 000000000..c16965cb1
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp
@@ -0,0 +1,310 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ * See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and the basic data parallel kernels.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_
+#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp" // sycl::item
+
+#include <cstddef> // std::size_t
+#include <tuple>   // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::basic {
+
+/**
+ * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function.
+ * @details Uses SYCL's basic data parallel kernels.
+ */
+class device_kernel_w_linear {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
+ * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + } + } + } + + // update global array with local one + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *w_d_; + const real_type *alpha_d_; + const real_type *sv_d_; + const std::size_t 
num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses SYCL's basic data parallel kernels. + */ +class device_kernel_predict_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
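+     * @details Each work-item accumulates an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of the result in private memory; in contrast to the work-group variant, no local memory and no explicit synchronization are used.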
+ * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the dot product calculation + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + } + } + } + + // update global array with local one + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *w_d_; + const real_type *rho_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. + * @details Uses SYCL's basic data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_predict { + public: + /** + * @brief Initialize the SYCL kernel function object. 
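+     * @note The trailing @p kernel_function_parameter pack is stored in a `std::tuple` and later unpacked by `detail::apply_kernel_function`, e.g., the parameters of the polynomial or rbf kernel function.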
+ * @param[in] prediction_d the predicted values + * @param[in] alpha_d the previously learned weights + * @param[in] rho_d the previously learned biases + * @param[in] sv_d the support vectors + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_d_{ prediction_d }, + alpha_d_{ alpha_d }, + rho_d_{ rho_d }, + sv_d_{ sv_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] idx indices representing the current point in the execution space + */ + void operator()(::sycl::item<2> idx) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // calculate the indices used in the current work-item + const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + + // create a work-item private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; ++dim) { + // perform the feature reduction calculation + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], + predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + } + } + } + + // update temp using the respective kernel function + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], 
kernel_function_parameter_);
+            }
+        }
+
+        // iterate over all classes and accumulate the predictions
+        for (std::size_t dim = 0; dim < num_classes_; ++dim) {
+            if (sv_idx == 0) {
+                for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                    const auto global_pp_idx = pp_idx + static_cast<std::size_t>(internal_pd);
+                    detail::atomic_op<real_type>{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim];
+                }
+            }
+
+            // atomically add the intermediate results to the global prediction
+            for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) {
+                for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                    const auto global_pp_idx = pp_idx + static_cast<std::size_t>(internal_pd);
+                    const auto global_sv_idx = sv_idx + static_cast<std::size_t>(internal_sv);
+
+                    detail::atomic_op<real_type>{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx];
+                }
+            }
+        }
+    }
+
+  private:
+    /// @cond Doxygen_suppress
+    real_type *prediction_d_;
+    const real_type *alpha_d_;
+    const real_type *rho_d_;
+    const real_type *sv_d_;
+    const real_type *predict_points_d_;
+    const std::size_t num_classes_;
+    const std::size_t num_sv_;
+    const std::size_t num_predict_points_;
+    const std::size_t num_features_;
+    const std::size_t grid_x_offset_;
+    const std::size_t grid_y_offset_;
+    const std::tuple<Args...> kernel_function_parameter_;
+    /// @endcond
+};
+
+}  // namespace plssvm::sycl::detail::basic
+
+#endif  // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_
diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp
new file mode 100644
index 000000000..4098c4914
--- /dev/null
+++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp
@@ -0,0 +1,547 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and the hierarchical data parallel kernels.
+ */
+
+#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_
+#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_
+#pragma once
+
+#include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
+#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+
+#include "sycl/sycl.hpp"  // sycl::group, sycl::private_memory, sycl::h_item
+
+#include <cstddef>  // std::size_t
+#include <tuple>    // std::tuple, std::make_tuple
+
+namespace plssvm::sycl::detail::hierarchical {
+
+/**
+ * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function.
+ * @details Uses SYCL's hierarchical data parallel kernels.
+ */
+class device_kernel_w_linear {
+  public:
+    /**
+     * @brief Initialize the SYCL kernel function object.
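+     * @note Launch sketch (illustrative only; `q` and the group counts `num_groups_y`/`num_groups_x` are assumptions, the actual submission lives in the SYCL backend). Hierarchical kernels are submitted via `parallel_for_work_group`:
+     * @code{.cpp}
+     * q.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for_work_group(::sycl::range<2>{ num_groups_y, num_groups_x },
+     *                                 ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+     *                                 device_kernel_w_linear{ w_d, alpha_d, sv_d, num_classes, num_sv, device_specific_num_sv, sv_offset, grid_x_offset, grid_y_offset });
+     * });
+     * @endcode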
+ * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory feature_idx{ group }; + ::sycl::private_memory feature_idx_linear{ group }; + ::sycl::private_memory class_idx{ group }; + ::sycl::private_memory class_idx_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // iterate 
over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA + data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + } + }); + + // implicit group barrier + + // perform the dot product calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } + } + }); + + // implicit group barrier + } + + // update global array with local one + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *w_d_; + const real_type *alpha_d_; + const real_type *sv_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses SYCL's hierarchical data parallel kernels. + */ +class device_kernel_predict_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. 
+ * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory pp_idx{ group }; + ::sycl::private_memory pp_idx_linear{ group }; + ::sycl::private_memory class_idx{ group }; + ::sycl::private_memory class_idx_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // iterate over all support vectors using blocking to be able to 
cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + } + }); + + // implicit group barrier + + // perform the dot product calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + } + } + } + }); + + // implicit group barrier + } + + // update global array with local one + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *w_d_; + const real_type *rho_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. 
+ * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_predict { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] prediction_d the predicted values + * @param[in] alpha_d the previously learned weights + * @param[in] rho_d the previously learned biases + * @param[in] sv_d the support vectors + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_d_{ prediction_d }, + alpha_d_{ alpha_d }, + rho_d_{ rho_d }, + sv_d_{ sv_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
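+     * @details Each `parallel_for_work_item` scope ends with an implicit group barrier; `sycl::private_memory` keeps the per-work-item indices and the `temp` tile alive across these scopes.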
+ * @param[in] group indices representing the current point in the execution space + */ + void operator()(::sycl::group<2> group) const { + // allocate shared memory + real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // calculate the indices used in the current work-item + ::sycl::private_memory pp_idx{ group }; + ::sycl::private_memory pp_idx_linear{ group }; + ::sycl::private_memory sv_idx_linear{ group }; + + ::sycl::private_memory temp{ group }; + + // initialize private and local variables + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // initialize private temp matrix to zero + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + 
THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } + }); + + // implicit group barrier + + // perform the feature reduction calculation + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + } + } + } + }); + + // implicit group barrier + } + } + + // update temp using the respective kernel function + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + } + } + }); + + // implicit group barrier + + { + // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception + // auto &alpha_cache = data_cache_pp; + // auto &out_cache = data_cache_sv; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_x == std::size_t{ 0 }) { + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + } else { + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + } + }); + + // implicit group barrier + + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + 
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + } + } + }); + + // implicit group barrier + } + + // add intermediate cached results to prediction_d + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(0)); + const auto local_id_1 = static_cast(idx.get_local_id(1)); + + const std::size_t threadIdx_x = idx.get_local_id(0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + + // implicit group barrier + } + } + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *alpha_d_; + const real_type *rho_d_; + const real_type *sv_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::hierarchical + +#endif // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp new file mode 100644 index 000000000..1a42161f5 --- /dev/null +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -0,0 +1,498 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and AdaptiveCpp's scoped parallelism. 
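+ * @note These kernels use AdaptiveCpp's scoped parallelism extension. A rough launch sketch; the `cgh.parallel(...)` spelling follows the AdaptiveCpp documentation, and `q`, the group counts, and `scoped_kernel_functor` are placeholder assumptions:
+ * @code{.cpp}
+ * q.submit([&](::sycl::handler &cgh) {
+ *     cgh.parallel(::sycl::range<2>{ num_groups_y, num_groups_x },
+ *                  ::sycl::range<2>{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE },
+ *                  scoped_kernel_functor);
+ * });
+ * @endcode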
+ */ + +#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ +#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op +#include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item + +#include // std::size_t +#include // std::tuple, std::make_tuple + +namespace plssvm::sycl::detail::scoped { + +/** + * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_w_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
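+     * @details `sycl::memory_environment` allocates the requested local and private memory up front and passes it to the lambda; each `sycl::distribute_items_and_wait` call distributes the work across the work-items of the group and synchronizes afterwards, mirroring the implicit barriers of the hierarchical variant.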
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA + data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + } + }); + + // perform the dot product calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; 
internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } + } + }); + } + + // update global array with local one + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + real_type *w_d_; + const real_type *alpha_d_; + const real_type *sv_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses AdaptiveCpp's scoped parallelism. + */ +class device_kernel_predict_linear { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + */ + device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + 
THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + } + }); + + // perform the dot product calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + } + } + } + }); + } + + // update global array with local one + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_class_idx = class_idx(idx) + static_cast(internal_class); + const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + }); + }); + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *w_d_; + const real_type *rho_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. + * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` + */ +template +class device_kernel_predict { + public: + /** + * @brief Initialize the SYCL kernel function object. + * @param[in] prediction_d the predicted values + * @param[in] alpha_d the previously learned weights + * @param[in] rho_d the previously learned biases + * @param[in] sv_d the support vectors + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + prediction_d_{ prediction_d }, + alpha_d_{ alpha_d }, + rho_d_{ rho_d }, + sv_d_{ sv_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @tparam T the implementation defined type of the group to iterate + * @param[in] group group representing the current point in the execution space + */ + template + void operator()(T group) const { + ::sycl::memory_environment(group, + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem(), + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { + // initialize private and local variables + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + // indices + pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + }); + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + 
global_pp_idx]; + data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + } + } + } + }); + } + + // update temp using the respective kernel function + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + } + } + }); + + { + // rename cached arrays + auto &alpha_cache = data_cache_pp; + auto &out_cache = data_cache_sv; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_x == std::size_t{ 0 }) { + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + } else { + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + } + } + }); + + // calculate intermediate results and 
store them in local memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + } + } + }); + } + + // add intermediate cached results to prediction_d + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); + + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + } + }); + } + } + }); + } + + private: + /// @cond Doxygen_suppress + real_type *prediction_d_; + const real_type *alpha_d_; + const real_type *rho_d_; + const real_type *sv_d_; + const real_type *predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::sycl::detail::scoped + +#endif // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp similarity index 97% rename from include/plssvm/backends/SYCL/kernel/predict_kernel.hpp rename to include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index aa12069d0..d451ac7d5 100644 --- a/include/plssvm/backends/SYCL/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -6,11 +6,11 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend. + * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend and the work-group data parallel kernels. 
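The scoped kernels above all follow the same AdaptiveCpp scoped-parallelism pattern: a functor whose `operator()` receives an implementation-defined group, requests local and per-work-item private storage through `::sycl::memory_environment`, and runs per-item code via `::sycl::distribute_items_and_wait`. The following minimal sketch condenses that pattern into a hypothetical `scoped_axpy` kernel (illustrative only, not part of this patch); it assumes an AdaptiveCpp toolchain with the scoped-parallelism extension and uses only the constructs already appearing in the kernels above.

```cpp
#include <cstddef>        // std::size_t

#include "sycl/sycl.hpp"  // AdaptiveCpp SYCL header providing the scoped-parallelism extension

// Hypothetical kernel computing y[i] += a * x[i] in the same scoped-parallelism style as the
// PLSSVM kernels above: private memory is requested once per work-item and then accessed via
// the s_item inside distribute_items_and_wait regions.
class scoped_axpy {
  public:
    scoped_axpy(float *y, const float *x, const float a, const std::size_t n) :
        y_{ y }, x_{ x }, a_{ a }, n_{ n } { }

    template <typename T>
    void operator()(T group) const {
        ::sycl::memory_environment(group,
                                   ::sycl::require_private_mem<std::size_t>(),
                                   [&](auto &global_idx) {
                                       // initialize the per-work-item private variable
                                       ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                                           global_idx(idx) = group[0] * group.get_logical_local_range(0) + idx.get_local_id(group, 0);
                                       });
                                       // perform the actual update
                                       ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                                           if (global_idx(idx) < n_) {
                                               y_[global_idx(idx)] += a_ * x_[global_idx(idx)];
                                           }
                                       });
                                   });
    }

  private:
    float *y_;
    const float *x_;
    const float a_;
    const std::size_t n_;
};
```

A queue submission for such a functor would then mirror the `scoped` branches in `csvm.cpp` below, i.e. `cgh.parallel(grid_range, block_range, scoped_axpy{ y, x, a, n })` inside a `queue.submit` call.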
*/ -#ifndef PLSSVM_BACKENDS_SYCL_PREDICT_KERNEL_HPP_ -#define PLSSVM_BACKENDS_SYCL_PREDICT_KERNEL_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ +#define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ #pragma once #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op @@ -18,15 +18,16 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "sycl/sycl.hpp" // sycl::item +#include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor #include // std::size_t #include // std::tuple, std::make_tuple -namespace plssvm::sycl::detail { +namespace plssvm::sycl::detail::work_group { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_w_linear { public: @@ -140,6 +141,7 @@ class device_kernel_w_linear { /** * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @details Uses SYCL's work-group data parallel kernels. */ class device_kernel_predict_linear { public: @@ -257,6 +259,7 @@ class device_kernel_predict_linear { /** * @brief Predict the @p predict_points_d using the @p kernel_function. + * @details Uses SYCL's work-group data parallel kernels. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ @@ -430,6 +433,6 @@ class device_kernel_predict { /// @endcond }; -} // namespace plssvm::sycl::detail +} // namespace plssvm::sycl::detail::work_group -#endif // PLSSVM_BACKENDS_SYCL_PREDICT_KERNEL_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp b/include/plssvm/backends/SYCL/kernel_invocation_types.hpp index ec10b05e8..d7cec1924 100644 --- a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp +++ b/include/plssvm/backends/SYCL/kernel_invocation_types.hpp @@ -17,6 +17,7 @@ #include "fmt/ostream.h" // fmt::ostream_formatter #include // forward declare std::ostream and std::istream +#include // std::vector namespace plssvm::sycl { @@ -26,10 +27,23 @@ namespace plssvm::sycl { enum class kernel_invocation_type { /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */ automatic, - /** Use the [`nd_range` invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke). */ - nd_range + /** Use the [`basic` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_basic_data_parallel_kernels). */ + basic, + /** Use the [`work-group` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_work_group_data_parallel_kernels). */ + work_group, + /** Use the [`hierarchical` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_hierarchical_data_parallel_kernels). **Note:** deprecated in newer SYCL version, will be replaced with a "better" version in future SYCL specifications. 
*/ + hierarchical, + /** Use the AdaptiveCpp specific [`scoped` parallelism](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/scoped-parallelism.md). */ + scoped }; +/** + * @brief Return a list of all currently available SYCL kernel invocation types. + * @details SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation type can be disabled during the CMake configuration. + * @return the available SYCL kernel invocation types (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector list_available_sycl_kernel_invocation_types(); + /** * @brief Output the @p invocation type to the given output-stream @p out. * @param[in,out] out the output-stream to write the backend type to diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp index 5d930aa19..073e92f6c 100644 --- a/include/plssvm/detail/cmd/parser_predict.hpp +++ b/include/plssvm/detail/cmd/parser_predict.hpp @@ -13,11 +13,12 @@ #define PLSSVM_DETAIL_CMD_PARSER_PREDICT_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // fmt::ostream_formatter @@ -46,6 +47,8 @@ struct parser_predict { /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel. target_platform target{ target_platform::automatic }; + /// The kernel invocation type when using SYCL as backend. + sycl::kernel_invocation_type sycl_kernel_invocation_type{ sycl::kernel_invocation_type::automatic }; /// The SYCL implementation to use with `--backend sycl`. 
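`list_available_sycl_kernel_invocation_types()` is only declared in this header; its definition is not part of this hunk. A plausible sketch, assuming it is guarded by the same `PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED` macro used in `csvm.cpp` below (the actual implementation may differ), is:

```cpp
#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type

#include <vector>  // std::vector

namespace plssvm::sycl {

std::vector<kernel_invocation_type> list_available_sycl_kernel_invocation_types() {
    // automatic, basic, and work_group are always available
    std::vector<kernel_invocation_type> available_types{ kernel_invocation_type::automatic,
                                                         kernel_invocation_type::basic,
                                                         kernel_invocation_type::work_group };
#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED)
    // hierarchical and scoped support can be disabled via the
    // PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS CMake option
    available_types.push_back(kernel_invocation_type::hierarchical);
    available_types.push_back(kernel_invocation_type::scoped);
#endif
    return available_types;
}

}  // namespace plssvm::sycl
```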
sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; diff --git a/src/main_predict.cpp b/src/main_predict.cpp index f27ad2d2f..ee16daba2 100644 --- a/src/main_predict.cpp +++ b/src/main_predict.cpp @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) { // create default csvm const std::unique_ptr svm = [&]() { if (use_sycl_as_backend) { - return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type); + return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); } else if (use_kokkos_as_backend) { return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); } else { diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt b/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt index 7dc8bb824..07b18e4f4 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/CMakeLists.txt @@ -18,6 +18,13 @@ option(PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP "Enables the generic SSC if (PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP) message(STATUS "Using the new AdaptiveCpp SSCP compilation flow.") set(ACPP_TARGETS "generic" CACHE STRING "" FORCE) + if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS) + message( + WARNING "Enabled SYCL's hierarchical and AdaptiveCpp's scoped kernels in AdaptiveCpp while using its SSCP compilation flow. " + "SSCP, however, does not currently implement these kernel invocation types, resulting in a runtime exception. " + "If you wish to use them, set \"PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP\" to \"OFF\" and use one of the legacy compilation flows. 
" + ) + endif () else () message(STATUS "Using the old AdaptiveCpp compilation flow.") # reformat PLSSVM_TARGET_PLATFORMS to be usable with ACPP_TARGETS diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp index eaf394dd2..cf37bd48a 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp @@ -8,38 +8,50 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} -#include "plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp" // plssvm::adaptivecpp::detail::::device_ptr -#include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) -#include "plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp" // plssvm::adaptivecpp::detail::{get_device_list, device_synchronize, get_adaptivecpp_version_short, get_adaptivecpp_version} -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp" // plssvm::sycl::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} -#include "plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::device_kernel_assembly -#include "plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::device_kernel_assembly_symm -#include "plssvm/backends/SYCL/kernel/predict_kernel.hpp" // plssvm::sycl::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} -#include "plssvm/detail/logging/log.hpp" // plssvm::detail::log -#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size -#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry -#include "plssvm/detail/utility.hpp" // plssvm::detail::get_system_memory -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma_type -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information -#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::parameter -#include "plssvm/shape.hpp" // plssvm::shape -#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level - -#include "sycl/sycl.hpp" // ::sycl::range, ::sycl::nd_range, ::sycl::handler, ::sycl::info::device +#include "plssvm/backend_types.hpp" // plssvm::backend_type 
+#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} +#include "plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp" // plssvm::adaptivecpp::detail::::device_ptr +#include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) +#include "plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp" // plssvm::adaptivecpp::detail::{get_device_list, device_synchronize, get_adaptivecpp_version_short, get_adaptivecpp_version} +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp" // plssvm::sycl::detail::basic::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp" // plssvm::sycl::detail::scoped::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::scoped::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp" // plssvm::sycl::detail::work_group::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::scoped::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp" // plssvm::sycl::detail::basic::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp" // plssvm::sycl::detail::scoped::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp" // plssvm::sycl::detail::work_group::{device_kernel_w_linear, device_kernel_predict_linear, 
device_kernel_predict} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} +#include "plssvm/detail/logging/log.hpp" // plssvm::detail::log +#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size +#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry +#include "plssvm/detail/utility.hpp" // plssvm::detail::get_system_memory +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma_type +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::parameter +#include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range, sycl::handler, sycl::info::device #include "fmt/color.h" // fmt::fg, fmt::color::orange #include "fmt/format.h" // fmt::format @@ -96,12 +108,12 @@ void csvm::init(const target_platform target) { // set correct kernel invocation type if "automatic" has been provided if (invocation_type_ == sycl::kernel_invocation_type::automatic) { - // always use nd_range for AdaptiveCpp - invocation_type_ = sycl::kernel_invocation_type::nd_range; - if (target_ == target_platform::cpu) { + // always use work_group for AdaptiveCpp + invocation_type_ = sycl::kernel_invocation_type::work_group; + if (target_ == target_platform::cpu) { // TODO: set to hierarchical or scoped?! 
#if !defined(__ACPP_USE_ACCELERATED_CPU__) && defined(__ACPP_ENABLE_OMPHOST_TARGET__) plssvm::detail::log_untracked(verbosity_level::full | verbosity_level::warning, - "WARNING: the AdaptiveCpp automatic target for the CPU is set to nd_range, but AdaptiveCpp hasn't been build with the \"omp.accelerated\" compilation flow resulting in major performance losses!\n"); + "WARNING: the AdaptiveCpp automatic target for the CPU is set to work_group, but AdaptiveCpp hasn't been build with the \"omp.accelerated\" compilation flow resulting in major performance losses!\n"); #endif } } @@ -120,7 +132,7 @@ void csvm::init(const target_platform target) { // use more detailed single rank command line output plssvm::detail::log_untracked(verbosity_level::full, comm_, - "\nUsing AdaptiveCpp ({}; {}) as SYCL backend with the kernel invocation type \"{}\" for the svm_kernel.\n", + "\nUsing AdaptiveCpp ({}; {}) as SYCL backend with the kernel invocation type \"{}\".\n", detail::get_adaptivecpp_version_short(), PLSSVM_ACPP_TARGETS, invocation_type_); @@ -242,50 +254,273 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
}; +#endif + break; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
}; +#endif + break; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; } } @@ -304,35 +539,85 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const std::size_t row_offset = data_distribution_->place_row_offset(device_id); - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_mirror_block = detail::dim_type_to_native<2>(mirror_exec.block); - for (const auto &[partial_grid, offsets] : mirror_exec.grids) { const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > 0) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_mirror_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_mirror_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::basic::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::work_group::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, mirror_exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, mirror_exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } } detail::device_synchronize(device); @@ -342,16 +627,43 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } detail::device_synchronize(device); } @@ -360,16 +672,43 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } } detail::device_synchronize(device); } @@ -387,50 +726,273 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
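// The range conversion that was previously inlined here (dim_type_to_native<2> plus an nd_range
// construction) has been moved into detail::get_execution_range. A minimal sketch of such a helper,
// assuming it simply reproduces the removed code; the actual helper in the SYCL backend's detail
// utilities may differ in signature and return type:
[[nodiscard]] inline ::sycl::nd_range<2> get_execution_range(const ::plssvm::detail::dim_type &partial_grid, const ::plssvm::detail::dim_type &block) {
    // convert the execution range block to SYCL's native range<2>
    const ::sycl::range<2> native_block = detail::dim_type_to_native<2>(block);
    // scale the partial grid by the block size to obtain the global range
    const ::sycl::range<2> native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block;
    return ::sycl::nd_range<2>{ native_partial_grid, native_block };
}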
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
}; +#endif + break; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
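// A constructor difference that holds throughout this file: only the work_group functors are handed
// the ::sycl::handler, while the basic, hierarchical, and scoped variants are not. A plausible
// reason, not spelled out in this patch, is that the nd_range kernels allocate explicit work-group
// local memory through local accessors, which can only be created from the handler. Illustrative
// sketch (tile size, member names, and the kernel body are assumptions, not PLSSVM code):
#include <cstddef>

#include <sycl/sycl.hpp>

class tile_copy_kernel {
  public:
    tile_copy_kernel(::sycl::handler &cgh, const float *in, float *out) :
        tile_{ ::sycl::range<2>{ 16, 16 }, cgh },  // explicit work-group local memory
        in_{ in },
        out_{ out } { }

    void operator()(::sycl::nd_item<2> item) const {
        const std::size_t i = item.get_local_id(0);
        const std::size_t j = item.get_local_id(1);
        // cooperatively stage a tile in local memory ...
        tile_[i][j] = in_[item.get_global_linear_id()];
        ::sycl::group_barrier(item.get_group());
        // ... and consume it after the barrier
        out_[item.get_global_linear_id()] = tile_[i][j];
    }

  private:
    ::sycl::local_accessor<float, 2> tile_;
    const float *in_;
    float *out_;
};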
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw 
backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, 
std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), 
num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; } } @@ -453,18 +1015,42 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" 
}; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" 
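// For readers unfamiliar with the two invocation types guarded by
// PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED: SYCL's hierarchical parallelism invokes the
// outer kernel once per work-group and distributes work-items via parallel_for_work_item, while
// AdaptiveCpp's scoped parallelism uses the handler::parallel extension seen above and is therefore
// only offered when AdaptiveCpp is the SYCL implementation. A minimal, self-contained hierarchical
// example; the buffer contents and kernel body are arbitrary and unrelated to PLSSVM's kernels:
#include <cstddef>
#include <vector>

#include <sycl/sycl.hpp>

int main() {
    constexpr std::size_t num_groups = 8;
    constexpr std::size_t group_size = 64;
    std::vector<float> data(num_groups * group_size, 1.0f);

    ::sycl::queue q{};
    {
        ::sycl::buffer<float, 1> buf{ data.data(), ::sycl::range<1>{ data.size() } };
        q.submit([&](::sycl::handler &cgh) {
            ::sycl::accessor acc{ buf, cgh, ::sycl::read_write };
            // outer lambda: executed once per work-group
            cgh.parallel_for_work_group(::sycl::range<1>{ num_groups }, ::sycl::range<1>{ group_size }, [=](::sycl::group<1> grp) {
                // inner lambda: executed once per work-item of the current work-group
                grp.parallel_for_work_item([&](::sycl::h_item<1> item) {
                    acc[item.get_global_id()] *= 2.0f;
                });
            });
        });
    }  // buffer destruction waits for the kernel and copies the results back into data
    return 0;
}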
}; +#endif + } } detail::device_synchronize(device); @@ -480,50 +1066,273 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::scoped::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
}; +#endif + break; + case sycl::kernel_invocation_type::scoped: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::scoped::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" }; +#endif + break; + } break; } } diff --git a/src/plssvm/backends/SYCL/CMakeLists.txt b/src/plssvm/backends/SYCL/CMakeLists.txt index b7a0fb119..d0de8c7f2 100644 --- a/src/plssvm/backends/SYCL/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/CMakeLists.txt @@ -20,6 +20,12 @@ set(PLSSVM_SYCL_BACKEND_FOUND_IMPLEMENTATIONS "") # check if SYCL can be enabled # ######################################################################################################################## +# enable kernel invocation types +option(PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS "Enables SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation types." ON) +if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS) + message(STATUS "Enable SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation types.") +endif () + # add AdaptiveCpp set(PLSSVM_ENABLE_SYCL_ADAPTIVECPP_BACKEND ${PLSSVM_ENABLE_SYCL_BACKEND} CACHE STRING "Enable AdaptiveCpp as SYCL Backend") set_property(CACHE PLSSVM_ENABLE_SYCL_ADAPTIVECPP_BACKEND PROPERTY STRINGS AUTO ON OFF) @@ -108,6 +114,24 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME}) ) endif () +# add kernel invocation type compile definitions +if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS) + target_compile_definitions(${PLSSVM_SYCL_BACKEND_LIBRARY_NAME} INTERFACE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) + target_compile_definitions(${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + endif () + if (TARGET ${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME}) + target_compile_definitions(${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME} PRIVATE PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + endif () + + # create name list + set(PLSSVM_SYCL_KERNEL_HIERARCHICAL_AND_SCOPED_NAME_LIST ";hierarchical") + if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) + list(APPEND PLSSVM_SYCL_KERNEL_HIERARCHICAL_AND_SCOPED_NAME_LIST "scoped") + endif () +endif () + # link against interface library target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) @@ -117,8 +141,14 @@ target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_HAS_SYCL_B # mark backend library as install target append_local_and_parent(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) -# set manpage string +# set manpage strings set_local_and_parent(PLSSVM_SYCL_BACKEND_NAME_LIST "automatic;${PLSSVM_SYCL_BACKEND_FOUND_IMPLEMENTATIONS}") +set_local_and_parent(PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST 
"automatic;basic;work_group${PLSSVM_SYCL_KERNEL_HIERARCHICAL_AND_SCOPED_NAME_LIST}") + +# populate transformed ACPP_TARGETS for tests +if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) + set_local_and_parent(ACPP_TARGETS "${ACPP_TARGETS}") +endif () # populate transformed ACPP_TARGETS for tests if (TARGET ${PLSSVM_SYCL_BACKEND_ADAPTIVECPP_LIBRARY_NAME}) diff --git a/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt b/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt index d3e53ba83..b52eb5d25 100644 --- a/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt +++ b/src/plssvm/backends/SYCL/DPCPP/CMakeLists.txt @@ -25,6 +25,14 @@ if (PLSSVM_SYCL_BACKEND_CHECK_FOR_DPCPP_COMPILER) message(CHECK_PASS "found") append_local_and_parent(PLSSVM_SYCL_BACKEND_FOUND_IMPLEMENTATIONS "dpcpp") + # check if the hierarchical kernel is enabled while the build type is Debug -> throw a warning + if (PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS AND uppercase_CMAKE_BUILD_TYPE MATCHES DEBUG) + message(WARNING "Enabled SYCL's hierarchical kernels in DPC++ while using Debug as build type. " + "This may result in compilation errors during PTX code generation. " + "If this is the case, either set \"PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS\" to \"OFF\" or use another build type." + ) + endif () + # set DPC++ specific targets set(PLSSVM_SYCL_DPCPP_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 2d3c85ec8..34dbf083f 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -8,36 +8,44 @@ #include "plssvm/backends/SYCL/DPCPP/csvm.hpp" -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} -#include "plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp" // plssvm::dpcpp::detail::::device_ptr -#include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) -#include "plssvm/backends/SYCL/DPCPP/detail/utility.hpp" // plssvm::dpcpp::detail::{get_device_list, device_synchronize, get_dpcpp_version} -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel/cg_explicit/blas.hpp" // plssvm::sycl::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} -#include "plssvm/backends/SYCL/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::device_kernel_assembly -#include "plssvm/backends/SYCL/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::device_kernel_assembly_symm -#include "plssvm/backends/SYCL/kernel/predict_kernel.hpp" // plssvm::sycl::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} 
-#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size -#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma_type -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/shape.hpp" // plssvm::shape -#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level - -#include "sycl/sycl.hpp" // sycl::queue, sycl::range, sycl::nd_range, sycl::handler, sycl::info::device +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} +#include "plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp" // plssvm::dpcpp::detail::::device_ptr +#include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) +#include "plssvm/backends/SYCL/DPCPP/detail/utility.hpp" // plssvm::dpcpp::detail::{get_device_list, device_synchronize, get_dpcpp_version} +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp" // plssvm::sycl::detail::basic::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp" // plssvm::sycl::detail::work_group::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly +#include "plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::basic::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::hierarchical::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp" // plssvm::sycl::detail::work_group::device_kernel_assembly_symm +#include "plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp" // plssvm::sycl::detail::basic::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include 
"plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp" // plssvm::sycl::detail::hierarchical::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp" // plssvm::sycl::detail::work_group::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::{data_distribution, triangular_data_distribution, rectangular_data_distribution} +#include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size +#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma_type +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/mpi/detail/information.hpp" // plssvm::mpi::detail::gather_and_print_csvm_information +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range, sycl::handler, sycl::info::device #include "fmt/color.h" // fmt::fg, fmt::color::orange #include "fmt/format.h" // fmt::format @@ -94,8 +102,8 @@ void csvm::init(const target_platform target) { // set correct kernel invocation type if "automatic" has been provided if (invocation_type_ == sycl::kernel_invocation_type::automatic) { - // always use nd_range for DPC++ - invocation_type_ = sycl::kernel_invocation_type::nd_range; + // always use work_group for DPC++ + invocation_type_ = sycl::kernel_invocation_type::work_group; } std::vector device_names{}; @@ -112,7 +120,7 @@ void csvm::init(const target_platform target) { // use more detailed single rank command line output plssvm::detail::log_untracked(verbosity_level::full, comm_, - "\nUsing DPC++ ({}; {}) as SYCL backend with the kernel invocation type \"{}\" for the svm_kernel.\n", + "\nUsing DPC++ ({}; {}) as SYCL backend with the kernel invocation type \"{}\".\n", detail::get_dpcpp_version(), detail::get_dpcpp_timestamp_version(), invocation_type_); @@ -159,7 +167,7 @@ csvm::~csvm() { for (const queue_type &q : devices_) { detail::device_synchronize(q); } - } catch (const plssvm::exception &e) { + } catch (const std::exception &e) { std::cout << e.what() << std::endl; std::terminate(); } @@ -229,50 +237,220 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution 
@@ -229,50 +237,220 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons
     device_ptr_type kernel_matrix_d{ num_entries_padded, device };  // only explicitly store the upper triangular matrix
     const real_type cost_factor = real_type{ 1.0 } / params.cost;
 
-    // convert execution range block to SYCL's native range<2>
-    const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block);
-
     for (const auto &[partial_grid, offsets] : exec.grids) {
-        // convert execution range partial_grid to SYCL's native range<2>
-        const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block;
-
-        const ::sycl::nd_range native_exec{ native_partial_grid, native_block };
-
         switch (params.kernel_type) {
+            //***************************************************//
+            //                linear kernel function             //
+            //***************************************************//
             case kernel_function_type::linear:
-                device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) {
-                    cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
-                });
+                switch (invocation_type_) {
+                    case sycl::kernel_invocation_type::automatic:
+                        throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" };
+                    case sycl::kernel_invocation_type::basic:
+                        device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) {
+                            cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block),
+                                             sycl::detail::basic::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
+                        });
+                        break;
+                    case sycl::kernel_invocation_type::work_group:
+                        device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) {
+                            cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block),
+                                             sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
+                        });
+                        break;
+                    case sycl::kernel_invocation_type::hierarchical:
+#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED)
+                        device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) {
+                            const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block);
+                            cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x });
+                        });
+#else
+                        throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" };
+#endif
+                        break;
+                    case sycl::kernel_invocation_type::scoped:
+                        throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!"
}; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly; - cgh.parallel_for(native_exec, functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; } } @@ -291,35 +469,69 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const std::size_t row_offset = data_distribution_->place_row_offset(device_id); - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_symm{ cgh, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_mirror_block = detail::dim_type_to_native<2>(mirror_exec.block); - for (const auto &[partial_grid, offsets] : mirror_exec.grids) { const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > 0) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_mirror_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_mirror_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::basic::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, mirror_exec.block), + sycl::detail::work_group::device_kernel_symm_mirror{ cgh, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, mirror_exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } } detail::device_synchronize(device); @@ -329,16 +541,35 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get(), rhs_d.get(), offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } detail::device_synchronize(device); } @@ -347,16 +578,35 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const std::size_t num_rhs = lhs_d.shape().x; const queue_type &device = devices_[device_id]; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.parallel_for(native_exec, sycl::detail::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets.y, offsets.x }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get(), scale, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } detail::device_synchronize(device); } @@ -374,50 +624,220 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const real_type cost_factor = real_type{ 1.0 } / params.cost; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_assembly_symm{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_assembly_symm{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_assembly_symm; - cgh.parallel_for(native_exec, functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_assembly_symm; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_assembly_symm; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; } } @@ -440,18 +860,35 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_w_linear{ cgh, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_w_linear{ w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } } detail::device_synchronize(device); @@ -467,50 +904,220 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, device }; - // convert execution range block to SYCL's native range<2> - const ::sycl::range native_block = detail::dim_type_to_native<2>(exec.block); - for (const auto &[partial_grid, offsets] : exec.grids) { - // convert execution range partial_grid to SYCL's native range<2> - const ::sycl::range native_partial_grid = detail::dim_type_to_native<2>(partial_grid) * native_block; - - const ::sycl::nd_range native_exec{ native_partial_grid, native_block }; - switch (params.kernel_type) { + //***************************************************// + // linear kernel function // + //***************************************************// case kernel_function_type::linear: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - cgh.parallel_for(native_exec, sycl::detail::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::basic::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + sycl::detail::work_group::device_kernel_predict_linear{ cgh, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), sycl::detail::hierarchical::device_kernel_predict_linear{ out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, offsets_ref.y, offsets_ref.x }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // polynomial kernel function // + //***************************************************// case kernel_function_type::polynomial: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // radial-basis kernel function // + //***************************************************// case kernel_function_type::rbf: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // sigmoid kernel function // + //***************************************************// case kernel_function_type::sigmoid: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // laplacian kernel function // + //***************************************************// case kernel_function_type::laplacian: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; + //***************************************************// + // chi-squared kernel function // + //***************************************************// case kernel_function_type::chi_squared: - device.impl->sycl_queue.submit([&, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::device_kernel_predict; - cgh.parallel_for(native_exec, functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); - }); + switch (invocation_type_) { + case sycl::kernel_invocation_type::automatic: + throw backend_exception{ "Can't determine the sycl::kernel_invocation_type!" }; + case sycl::kernel_invocation_type::basic: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::basic::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::work_group: + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::work_group::device_kernel_predict; + cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), + functor_type{ cgh, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); + break; + case sycl::kernel_invocation_type::hierarchical: +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { + using functor_type = sycl::detail::hierarchical::device_kernel_predict; + const auto exec_range = detail::get_execution_range(partial_grid_ref, exec.block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), functor_type{ out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); + }); +#else + throw backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" }; +#endif + break; + case sycl::kernel_invocation_type::scoped: + throw backend_exception{ "Can't use the sycl::kernel_invocation_type::scoped with DPC++!" 
}; + } break; } } diff --git a/src/plssvm/backends/SYCL/kernel_invocation_types.cpp b/src/plssvm/backends/SYCL/kernel_invocation_types.cpp index 58a73ca26..87ee18f26 100644 --- a/src/plssvm/backends/SYCL/kernel_invocation_types.cpp +++ b/src/plssvm/backends/SYCL/kernel_invocation_types.cpp @@ -14,15 +14,37 @@ #include // std::istream #include // std::ostream #include // std::string +#include // std::vector namespace plssvm::sycl { +std::vector list_available_sycl_kernel_invocation_types() { + std::vector available_sycl_kernel_invocation_types = { + kernel_invocation_type::automatic, + kernel_invocation_type::basic, + kernel_invocation_type::work_group + }; +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + available_sycl_kernel_invocation_types.push_back(kernel_invocation_type::hierarchical); + #if defined(PLSSVM_SYCL_BACKEND_HAS_ADAPTIVECPP) + available_sycl_kernel_invocation_types.push_back(kernel_invocation_type::scoped); + #endif +#endif + return available_sycl_kernel_invocation_types; +} + std::ostream &operator<<(std::ostream &out, const kernel_invocation_type invocation) { switch (invocation) { case kernel_invocation_type::automatic: return out << "automatic"; - case kernel_invocation_type::nd_range: - return out << "nd_range"; + case kernel_invocation_type::basic: + return out << "basic"; + case kernel_invocation_type::work_group: + return out << "work_group"; + case kernel_invocation_type::hierarchical: + return out << "hierarchical"; + case kernel_invocation_type::scoped: + return out << "scoped"; } return out << "unknown"; } @@ -34,8 +56,14 @@ std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation) { if (str == "automatic" || str == "auto") { invocation = kernel_invocation_type::automatic; - } else if (str == "nd_range") { - invocation = kernel_invocation_type::nd_range; + } else if (str == "basic") { + invocation = kernel_invocation_type::basic; + } else if (str == "work_group" || str == "work-group" || str == "nd_range" || str == "nd-range") { + invocation = kernel_invocation_type::work_group; + } else if (str == "hierarchical") { + invocation = kernel_invocation_type::hierarchical; + } else if (str == "scoped") { + invocation = kernel_invocation_type::scoped; } else { in.setstate(std::ios::failbit); } diff --git a/src/plssvm/detail/cmd/parser_predict.cpp b/src/plssvm/detail/cmd/parser_predict.cpp index c90bb62d1..ed4476e96 100644 --- a/src/plssvm/detail/cmd/parser_predict.cpp +++ b/src/plssvm/detail/cmd/parser_predict.cpp @@ -8,17 +8,18 @@ #include "plssvm/detail/cmd/parser_predict.hpp" -#include "plssvm/backend_types.hpp" // plssvm::list_available_backends -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::list_available_execution_spaces -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::list_available_sycl_implementations -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/exceptions/exceptions.hpp" // plssvm::cmd_parser_exit -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/target_platforms.hpp" // plssvm::list_available_target_platforms -#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity, plssvm::verbosity_level -#include "plssvm/version/version.hpp" // plssvm::version::detail::get_version_info +#include "plssvm/backend_types.hpp" // plssvm::list_available_backends +#include 
"plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::list_available_execution_spaces +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::list_available_sycl_implementations +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::{list_available_sycl_kernel_invocation_types, kernel_invocation_type} +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/exceptions/exceptions.hpp" // plssvm::cmd_parser_exit +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::list_available_target_platforms +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity, plssvm::verbosity_level +#include "plssvm/version/version.hpp" // plssvm::version::detail::get_version_info #include "cxxopts.hpp" // cxxopts::{Options, value, ParseResult} #include "fmt/color.h" // fmt::fg, fmt::color::orange @@ -52,6 +53,7 @@ parser_predict::parser_predict(const mpi::communicator &comm, int argc, char **a ("b,backend", fmt::format("choose the backend: {}", fmt::join(list_available_backends(), "|")), cxxopts::value()->default_value(fmt::format("{}", backend))) ("p,target_platform", fmt::format("choose the target platform: {}", fmt::join(list_available_target_platforms(), "|")), cxxopts::value()->default_value(fmt::format("{}", target))) #if defined(PLSSVM_HAS_SYCL_BACKEND) + ("sycl_kernel_invocation_type", fmt::format("choose the kernel invocation type when using SYCL as backend: {}", fmt::join(sycl::list_available_sycl_kernel_invocation_types(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_implementation_type))) #endif #if defined(PLSSVM_HAS_KOKKOS_BACKEND) @@ -119,13 +121,24 @@ parser_predict::parser_predict(const mpi::communicator &comm, int argc, char **a #if defined(PLSSVM_HAS_SYCL_BACKEND) { - // parse SYCL implementation used in the SYCL backend - sycl_implementation_type = result["sycl_implementation_type"].as(); + // parse kernel invocation type when using SYCL as backend + sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as(); - // assembly warning condition + // assemble warning condition const std::vector target_platforms = { target == target_platform::automatic ? 
determine_default_target_platform() : target }; const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl); + // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend + if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + comm, + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n", + sycl_kernel_invocation_type); + } + + // parse SYCL implementation used in the SYCL backend + sycl_implementation_type = result["sycl_implementation_type"].as(); + // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) { detail::log_untracked(verbosity_level::full | verbosity_level::warning, @@ -237,7 +250,11 @@ std::ostream &operator<<(std::ostream &out, const parser_predict ¶ms) { params.target); if (params.backend == backend_type::sycl || params.backend == backend_type::automatic) { - out << fmt::format("SYCL implementation type: {}\n", params.sycl_implementation_type); + out << fmt::format( + "SYCL implementation type: {}\n" + "SYCL kernel invocation type: {}\n", + params.sycl_implementation_type, + params.sycl_kernel_invocation_type); } if (params.backend == backend_type::kokkos || params.backend == backend_type::automatic) { diff --git a/src/plssvm/detail/cmd/parser_train.cpp b/src/plssvm/detail/cmd/parser_train.cpp index fdb0070c9..b47422a0f 100644 --- a/src/plssvm/detail/cmd/parser_train.cpp +++ b/src/plssvm/detail/cmd/parser_train.cpp @@ -11,7 +11,7 @@ #include "plssvm/backend_types.hpp" // plssvm::list_available_backends, plssvm::determine_default_backend #include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{list_available_execution_spaces, execution_space} #include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{list_available_sycl_implementations, implementation_type} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::{list_available_sycl_kernel_invocation_types, kernel_invocation_type} #include "plssvm/classification_types.hpp" // plssvm::classification_type, plssvm::classification_type_to_full_string #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT @@ -80,7 +80,7 @@ parser_train::parser_train(const mpi::communicator &comm, int argc, char **argv) ("b,backend", fmt::format("choose the backend: {}", fmt::join(list_available_backends(), "|")), cxxopts::value()->default_value(fmt::format("{}", backend))) ("p,target_platform", fmt::format("choose the target platform: {}", fmt::join(list_available_target_platforms(), "|")), cxxopts::value()->default_value(fmt::format("{}", target))) #if defined(PLSSVM_HAS_SYCL_BACKEND) - ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range", cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) + ("sycl_kernel_invocation_type", fmt::format("choose the kernel invocation type when using SYCL as backend: {}", 
fmt::join(sycl::list_available_sycl_kernel_invocation_types(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_implementation_type))) #endif #if defined(PLSSVM_HAS_KOKKOS_BACKEND) diff --git a/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp b/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp index 448c9bc97..446374ef6 100644 --- a/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp +++ b/tests/backends/SYCL/AdaptiveCpp/adaptivecpp_csvm.cpp @@ -14,7 +14,7 @@ #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost, plssvm::sycl_kernel_invocation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "tests/backends/generic_base_csvc_tests.hpp" // generic C-SVC tests to instantiate @@ -49,7 +49,7 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, default_construct) { // default constructor must always work EXPECT_NO_THROW(csvm_type{}); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(AdaptiveCppCSVMConstructor, construct_parameter) { @@ -57,7 +57,7 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_parameter) { // the automatic target platform must always be available EXPECT_NO_THROW(csvm_type{ plssvm::parameter{} }); - EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_parameter) { @@ -69,33 +69,33 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_parameter) { // every target is allowed for SYCL #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params })); - EXPECT_NO_THROW((csvm_type{ 
plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -107,7 +107,7 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_named_args) { // every target is allowed for SYCL EXPECT_NO_THROW((csvm_type{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_named_args) { @@ -117,48 +117,48 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, construct_target_and_named_args) { #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ 
plssvm::target_platform::cpu, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else 
EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::adaptivecpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -174,18 +174,24 @@ TYPED_TEST(AdaptiveCppCSVMConstructor, get_kernel_invocation_type) { EXPECT_NE(svm.get_kernel_invocation_type(), plssvm::sycl::kernel_invocation_type::automatic); } -template +template struct adaptivecpp_csvm_test_type { using mock_csvm_type = mock_adaptivecpp_csvm; using csvm_type = plssvm::adaptivecpp::csvm; using csvc_type = plssvm::adaptivecpp::csvc; using csvr_type = plssvm::adaptivecpp::csvr; using device_ptr_type = typename csvm_type::device_ptr_type; - inline constexpr static auto additional_arguments = std::make_tuple(); + inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::sycl_kernel_invocation_type, invocation_type)); }; // a tuple containing the test structs -using adaptivecpp_csvm_test_tuple = std::tuple>; +using adaptivecpp_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type, +#endif + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type>; // the tests used in the instantiated GTest test suites // general test types @@ -231,7 +237,14 @@ INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppCSVM, GenericGPUCSVMKernelFunction, ad // generic GPU C-SVM DeathTests - correct grid sizes INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppCSVMDeathTest, GenericGPUCSVMDeathTest, adaptivecpp_csvm_test_type_gtest, naming::test_parameter_to_name); -using adaptivecpp_mock_csvm_test_tuple = std::tuple>; +using adaptivecpp_mock_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type, +#endif + adaptivecpp_csvm_test_type, + adaptivecpp_csvm_test_type>; + using adaptivecpp_mock_csvm_test_type_list = util::cartesian_type_product_t; using adaptivecpp_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t; diff --git a/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp b/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp index a3097380c..e909fa2a9 100644 --- a/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp +++ b/tests/backends/SYCL/AdaptiveCpp/detail/utility.cpp @@ -10,10 +10,11 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/utility.hpp" -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include "gtest/gtest.h" // TEST, EXPECT_NE, EXPECT_FALSE @@ -60,6 +61,50 @@ TEST(AdaptiveCppUtility, dim_type_to_native_3) { EXPECT_EQ(native_dim[2], dim.x); } +TEST(AdaptiveCppUtility, get_execution_range_basic) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL 
execution range + const ::sycl::range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (sycl::range<2>{ 512ull, 512ull })); +} + +TEST(AdaptiveCppUtility, get_execution_range_work_group) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 512ull, 512ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + +TEST(AdaptiveCppUtility, get_execution_range_hierarchical) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 64ull, 64ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + +TEST(AdaptiveCppUtility, get_execution_range_scoped) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::adaptivecpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 64ull, 64ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + TEST(AdaptiveCppUtility, get_device_list) { const auto &[queues, actual_target] = plssvm::adaptivecpp::detail::get_device_list(plssvm::target_platform::automatic); // at least one queue must be provided diff --git a/tests/backends/SYCL/DPCPP/detail/utility.cpp b/tests/backends/SYCL/DPCPP/detail/utility.cpp index ca6eaa713..84b2d60f9 100644 --- a/tests/backends/SYCL/DPCPP/detail/utility.cpp +++ b/tests/backends/SYCL/DPCPP/detail/utility.cpp @@ -10,10 +10,11 @@ #include "plssvm/backends/SYCL/DPCPP/detail/utility.hpp" -#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "sycl/sycl.hpp" // sycl::range +#include "sycl/sycl.hpp" // sycl::range, sycl::nd_range #include "gtest/gtest.h" // TEST, EXPECT_NE, EXPECT_FALSE @@ -59,6 +60,39 @@ TEST(DPCPPUtility, dim_type_to_native_3) { EXPECT_EQ(native_dim[2], dim.x); } +TEST(DPCPPUtility, get_execution_range_basic) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::range exec = plssvm::dpcpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (sycl::range<2>{ 512ull, 512ull })); +} + +TEST(DPCPPUtility, get_execution_range_work_group) { + // create a grid + const plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::dpcpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 512ull, 512ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + +TEST(DPCPPUtility, get_execution_range_hierarchical) { + // create a grid + const 
plssvm::detail::dim_type grid{ 64ull, 64ull }; + const plssvm::detail::dim_type block{ 8ull, 8ull }; + + // calculate the SYCL execution range + const ::sycl::nd_range exec = plssvm::dpcpp::detail::get_execution_range(grid, block); + + EXPECT_EQ(exec, (::sycl::nd_range<2>{ ::sycl::range<2>{ 64ull, 64ull }, ::sycl::range<2>{ 8ull, 8ull } })); +} + TEST(DPCPPUtility, get_device_list) { const auto &[queues, actual_target] = plssvm::dpcpp::detail::get_device_list(plssvm::target_platform::automatic); // at least one queue must be provided @@ -67,12 +101,12 @@ TEST(DPCPPUtility, get_device_list) { EXPECT_NE(actual_target, plssvm::target_platform::automatic); } -TEST(AdaptiveCppUtility, get_dpcpp_version) { +TEST(DPCPPUtility, get_dpcpp_version) { const std::regex reg{ "[0-9]+\\.[0-9]+\\.[0-9]+", std::regex::extended }; EXPECT_TRUE(std::regex_match(plssvm::dpcpp::detail::get_dpcpp_version(), reg)); } -TEST(AdaptiveCppUtility, get_dpcpp_timestamp_version) { +TEST(DPCPPUtility, get_dpcpp_timestamp_version) { const std::string version = plssvm::dpcpp::detail::get_dpcpp_timestamp_version(); EXPECT_FALSE(version.empty()); } diff --git a/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp b/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp index b8ca19bf8..ada2f4b56 100644 --- a/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp +++ b/tests/backends/SYCL/DPCPP/dpcpp_csvm.cpp @@ -14,7 +14,7 @@ #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kernel_type, plssvm::cost, plssvm::sycl_kernel_invocation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "tests/backends/generic_base_csvc_tests.hpp" // generic C-SVC tests to instantiate @@ -49,7 +49,7 @@ TYPED_TEST(DPCPPCSVMConstructor, default_construct) { // default constructor must always work EXPECT_NO_THROW(csvm_type{}); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(DPCPPCSVMConstructor, construct_parameter) { @@ -57,7 +57,7 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_parameter) { // the automatic target platform must always be available EXPECT_NO_THROW(csvm_type{ plssvm::parameter{} }); - EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::parameter{}, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_parameter) { @@ -69,33 +69,33 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_parameter) { // every target is allowed for SYCL #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - 
EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else - EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, params, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -107,7 +107,7 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_named_args) { // every target is allowed for SYCL EXPECT_NO_THROW((csvm_type{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = 
plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); } TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_named_args) { @@ -117,48 +117,48 @@ TYPED_TEST(DPCPPCSVMConstructor, construct_target_and_named_args) { #if defined(PLSSVM_HAS_CPU_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::cpu, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_AMD_TARGET) EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif #if defined(PLSSVM_HAS_INTEL_TARGET) EXPECT_NO_THROW((csvm_type{ 
plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::cost = 2.0 })); - EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range })); + EXPECT_NO_THROW((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group })); #else EXPECT_THROW_WHAT((csvm_type{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, - plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range }), + plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group }), plssvm::dpcpp::backend_exception, "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); #endif @@ -174,18 +174,23 @@ TYPED_TEST(DPCPPCSVMConstructor, get_kernel_invocation_type) { EXPECT_NE(svm.get_kernel_invocation_type(), plssvm::sycl::kernel_invocation_type::automatic); } -template +template struct dpcpp_csvm_test_type { using mock_csvm_type = mock_dpcpp_csvm; using csvm_type = plssvm::dpcpp::csvm; using csvc_type = plssvm::dpcpp::csvc; using csvr_type = plssvm::dpcpp::csvr; using device_ptr_type = typename csvm_type::device_ptr_type; - inline static auto additional_arguments = std::make_tuple(); + inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::sycl_kernel_invocation_type, invocation_type)); }; // a tuple containing the test structs -using dpcpp_csvm_test_tuple = std::tuple>; +using dpcpp_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + dpcpp_csvm_test_type, +#endif + dpcpp_csvm_test_type, + dpcpp_csvm_test_type>; // the tests used in the instantiated GTest test suites // general test types @@ -231,7 +236,13 @@ INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPCSVM, GenericGPUCSVMKernelFunction, dpcpp_ke // generic GPU C-SVM DeathTests - correct grid sizes INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPCSVMDeathTest, GenericGPUCSVMDeathTest, dpcpp_csvm_test_type_gtest, naming::test_parameter_to_name); -using dpcpp_mock_csvm_test_tuple = std::tuple>; +using dpcpp_mock_csvm_test_tuple = std::tuple< +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + dpcpp_csvm_test_type, +#endif + dpcpp_csvm_test_type, + dpcpp_csvm_test_type>; + using dpcpp_mock_csvm_test_type_list = util::cartesian_type_product_t; using dpcpp_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t; diff --git a/tests/backends/SYCL/implementation_types.cpp b/tests/backends/SYCL/implementation_types.cpp index 9f0819cc3..f7f31b45d 100644 --- a/tests/backends/SYCL/implementation_types.cpp +++ b/tests/backends/SYCL/implementation_types.cpp @@ -12,7 +12,8 @@ #include "tests/custom_test_macros.hpp" // EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING -#include "gtest/gtest.h" // TEST, EXPECT_TRUE +#include "gmock/gmock.h" // EXPECT_THAT; ::testing::Contains +#include "gtest/gtest.h" // TEST, EXPECT_TRUE, EXPECT_GE #include // std::istringstream diff --git a/tests/backends/SYCL/kernel_invocation_types.cpp b/tests/backends/SYCL/kernel_invocation_types.cpp index bcb2034b6..3227cb077 100644 --- a/tests/backends/SYCL/kernel_invocation_types.cpp +++ b/tests/backends/SYCL/kernel_invocation_types.cpp @@ -12,7 +12,8 @@ #include "tests/custom_test_macros.hpp" // 
EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING -#include "gtest/gtest.h" // TEST, EXPECT_TRUE +#include "gmock/gmock.h" // EXPECT_THAT; ::testing::Contains +#include "gtest/gtest.h" // TEST, EXPECT_TRUE, EXPECT_GE #include // std::istringstream @@ -20,12 +21,15 @@ TEST(SYCLKernelInvocationType, to_string) { // check conversions to std::string EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::automatic, "automatic"); - EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::nd_range, "nd_range"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::basic, "basic"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::work_group, "work_group"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::hierarchical, "hierarchical"); + EXPECT_CONVERSION_TO_STRING(plssvm::sycl::kernel_invocation_type::scoped, "scoped"); } TEST(SYCLKernelInvocationType, to_string_unknown) { // check conversions to std::string from unknown file_format_type - EXPECT_CONVERSION_TO_STRING(static_cast(3), "unknown"); + EXPECT_CONVERSION_TO_STRING(static_cast(5), "unknown"); } // check whether the std::string -> plssvm::sycl::kernel_invocation_type conversions are correct @@ -35,8 +39,16 @@ TEST(SYCLKernelInvocationType, from_string) { EXPECT_CONVERSION_FROM_STRING("AUTOMATIC", plssvm::sycl::kernel_invocation_type::automatic); EXPECT_CONVERSION_FROM_STRING("auto", plssvm::sycl::kernel_invocation_type::automatic); EXPECT_CONVERSION_FROM_STRING("AUTO", plssvm::sycl::kernel_invocation_type::automatic); - EXPECT_CONVERSION_FROM_STRING("nd_range", plssvm::sycl::kernel_invocation_type::nd_range); - EXPECT_CONVERSION_FROM_STRING("ND_RANGE", plssvm::sycl::kernel_invocation_type::nd_range); + EXPECT_CONVERSION_FROM_STRING("basic", plssvm::sycl::kernel_invocation_type::basic); + EXPECT_CONVERSION_FROM_STRING("BASIC", plssvm::sycl::kernel_invocation_type::basic); + EXPECT_CONVERSION_FROM_STRING("work_group", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("WORK-GROUP", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("nd_range", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("ND-RANGE", plssvm::sycl::kernel_invocation_type::work_group); + EXPECT_CONVERSION_FROM_STRING("hierarchical", plssvm::sycl::kernel_invocation_type::hierarchical); + EXPECT_CONVERSION_FROM_STRING("HIERARCHICAL", plssvm::sycl::kernel_invocation_type::hierarchical); + EXPECT_CONVERSION_FROM_STRING("scoped", plssvm::sycl::kernel_invocation_type::scoped); + EXPECT_CONVERSION_FROM_STRING("SCOPED", plssvm::sycl::kernel_invocation_type::scoped); } TEST(SYCLKernelInvocationType, from_string_unknown) { @@ -46,3 +58,15 @@ TEST(SYCLKernelInvocationType, from_string_unknown) { input >> invocation_type; EXPECT_TRUE(input.fail()); } + +TEST(SYCLKernelInvocationType, minimal_available_sycl_kernel_invocation_types) { + const std::vector invocation_type = plssvm::sycl::list_available_sycl_kernel_invocation_types(); + + // at least three must be available (automatic, basic, and work_group)! 
diff --git a/tests/detail/cmd/parser_predict.cpp b/tests/detail/cmd/parser_predict.cpp
index f8ee46ed3..8365494b7 100644
--- a/tests/detail/cmd/parser_predict.cpp
+++ b/tests/detail/cmd/parser_predict.cpp
@@ -10,13 +10,14 @@
 #include "plssvm/detail/cmd/parser_predict.hpp"
 
-#include "plssvm/backend_types.hpp"                       // plssvm::backend_type
-#include "plssvm/backends/Kokkos/execution_space.hpp"     // plssvm::kokkos::execution_space
-#include "plssvm/backends/SYCL/implementation_types.hpp"  // plssvm::sycl::implementation_type
-#include "plssvm/constants.hpp"                           // plssvm::real_type
-#include "plssvm/exceptions/exceptions.hpp"               // plssvm::cmd_parser_exit
-#include "plssvm/target_platforms.hpp"                    // plssvm::target_platform
-#include "plssvm/verbosity_levels.hpp"                    // plssvm::verbosity
+#include "plssvm/backend_types.hpp"                          // plssvm::backend_type
+#include "plssvm/backends/Kokkos/execution_space.hpp"        // plssvm::kokkos::execution_space
+#include "plssvm/backends/SYCL/implementation_types.hpp"     // plssvm::sycl::implementation_type
+#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
+#include "plssvm/constants.hpp"                              // plssvm::real_type
+#include "plssvm/exceptions/exceptions.hpp"                  // plssvm::cmd_parser_exit
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
+#include "plssvm/verbosity_levels.hpp"                       // plssvm::verbosity
 
 #include "tests/custom_test_macros.hpp"      // EXPECT_CONVERSION_TO_STRING, EXPECT_THROW_WHAT
 #include "tests/detail/cmd/cmd_utility.hpp"  // util::ParameterBase
@@ -47,7 +48,9 @@ TEST_F(ParserPredict, minimal) {
     // check parsed values
     EXPECT_EQ(parser.backend, plssvm::backend_type::automatic);
     EXPECT_EQ(parser.target, plssvm::target_platform::automatic);
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
+    EXPECT_EQ(parser.kokkos_execution_space, plssvm::kokkos::execution_space::automatic);
     EXPECT_FALSE(parser.strings_as_labels);
     EXPECT_EQ(parser.input_filename, "data.libsvm");
     EXPECT_EQ(parser.model_filename, "data.libsvm.model");
@@ -69,6 +72,7 @@ TEST_F(ParserPredict, minimal_output) {
         "backend: automatic\n"
         "target platform: automatic\n"
         "SYCL implementation type: automatic\n"
+        "SYCL kernel invocation type: automatic\n"
         "Kokkos execution space: automatic\n"
         "label_type: int (default)\n"
         "real_type: {}\n"
@@ -86,7 +90,7 @@ TEST_F(ParserPredict, all_arguments) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-predict", "--backend", "cuda", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
@@ -108,8 +112,10 @@ TEST_F(ParserPredict, all_arguments) {
     EXPECT_EQ(parser.backend, plssvm::backend_type::cuda);
     EXPECT_EQ(parser.target, plssvm::target_platform::gpu_nvidia);
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::work_group);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::dpcpp);
 #else
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
@@ -137,7 +143,7 @@ TEST_F(ParserPredict, all_arguments_output) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-predict", "--backend", "automatic", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
@@ -161,9 +167,11 @@ TEST_F(ParserPredict, all_arguments_output) {
         "target platform: gpu_nvidia\n" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    correct += "SYCL implementation type: dpcpp\n";
+    correct += "SYCL implementation type: dpcpp\n"
+               "SYCL kernel invocation type: work_group\n";
 #else
-    correct += "SYCL implementation type: automatic\n";
+    correct += "SYCL implementation type: automatic\n"
+               "SYCL kernel invocation type: automatic\n";
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     correct += fmt::format("Kokkos execution space: {}\n", space);
@@ -236,6 +244,28 @@ INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictTargetPlatform, ::testing::
 
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
 
+class ParserPredictSYCLKernelInvocation : public ParserPredict,
+                                          public ::testing::WithParamInterface> { };
+
+TEST_P(ParserPredictSYCLKernelInvocation, parsing) {
+    const auto &[flag, value] = GetParam();
+    // convert string to sycl::kernel_invocation_type
+    const auto sycl_kernel_invocation_type = util::convert_from_string<plssvm::sycl::kernel_invocation_type>(value);
+    // create artificial command line arguments in test fixture
+    this->CreateCMDArgs({ "./plssvm-predict", flag, value, "data.libsvm" });
+    // create parameter object
+    const plssvm::detail::cmd::parser_predict parser{ this->get_comm(), this->get_argc(), this->get_argv() };
+    // test for correctness
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, sycl_kernel_invocation_type);
+}
+
+// clang-format off
+INSTANTIATE_TEST_SUITE_P(ParserPredict, ParserPredictSYCLKernelInvocation, ::testing::Combine(
+                ::testing::Values("--sycl_kernel_invocation_type"),
+                ::testing::Values("automatic", "auto", "basic", "nd_range", "work_group", "hierarchical", "scoped")),
+                naming::pretty_print_parameter_flag_and_value);
+// clang-format on
+
 class ParserPredictSYCLImplementation : public ParserPredict,
                                         public ::testing::WithParamInterface> { };
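Since `plssvm-predict` now also accepts `--sycl_kernel_invocation_type`, a caller may want to check that a parsed value is actually compiled into the current build before using it. The sketch below combines `plssvm::sycl::list_available_sycl_kernel_invocation_types()` (exercised by the new test above) with a plain `std::find`; the validation helper itself is illustrative only, and the header is assumed to declare the listing function alongside the enum.

```cpp
#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type, list_available_sycl_kernel_invocation_types (assumed)

#include <algorithm>  // std::find
#include <iostream>   // std::cout
#include <vector>     // std::vector

// hypothetical helper: is the requested invocation type usable in this build?
bool invocation_type_available(const plssvm::sycl::kernel_invocation_type requested) {
    const std::vector<plssvm::sycl::kernel_invocation_type> available = plssvm::sycl::list_available_sycl_kernel_invocation_types();
    return std::find(available.cbegin(), available.cend(), requested) != available.cend();
}

int main() {
    std::cout << std::boolalpha
              << invocation_type_available(plssvm::sycl::kernel_invocation_type::work_group) << '\n'     // always true
              << invocation_type_available(plssvm::sycl::kernel_invocation_type::hierarchical) << '\n';  // true only if hierarchical/scoped kernels were enabled at build time
    return 0;
}
```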
diff --git a/tests/detail/cmd/parser_train.cpp b/tests/detail/cmd/parser_train.cpp
index 94337f90b..36e70228c 100644
--- a/tests/detail/cmd/parser_train.cpp
+++ b/tests/detail/cmd/parser_train.cpp
@@ -63,6 +63,7 @@ TEST_F(ParserTrain, minimal) {
     EXPECT_EQ(parser.solver, plssvm::solver_type::automatic);
     EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::automatic);
+    EXPECT_EQ(parser.kokkos_execution_space, plssvm::kokkos::execution_space::automatic);
     EXPECT_FALSE(parser.strings_as_labels);
     EXPECT_EQ(parser.input_filename, "data.libsvm");
     EXPECT_EQ(parser.model_filename, "data.libsvm.model");
@@ -107,7 +108,7 @@ TEST_F(ParserTrain, all_arguments) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-train", "--svm_type", "1", "--kernel_type", "1", "--degree", "2", "--gamma", "1.5", "--coef0", "-1.5", "--cost", "2", "--epsilon", "1e-12", "--max_iter", "100", "--classification", "oao", "--solver", "cg_implicit", "--backend", "cuda", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "nd_range", "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const plssvm::kokkos::execution_space space = plssvm::kokkos::list_available_execution_spaces()[1];  // [0] would be automatic
@@ -141,7 +142,7 @@ TEST_F(ParserTrain, all_arguments) {
     EXPECT_EQ(parser.target, plssvm::target_platform::gpu_nvidia);
     EXPECT_EQ(parser.solver, plssvm::solver_type::cg_implicit);
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::nd_range);
+    EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::work_group);
     EXPECT_EQ(parser.sycl_implementation_type, plssvm::sycl::implementation_type::dpcpp);
 #else
     EXPECT_EQ(parser.sycl_kernel_invocation_type, plssvm::sycl::kernel_invocation_type::automatic);
@@ -169,7 +170,7 @@ TEST_F(ParserTrain, all_arguments_output) {
     // create artificial command line arguments in test fixture
     std::vector<std::string> cmd_args = { "./plssvm-train", "--svm_type", "1", "--kernel_type", "1", "--degree", "2", "--gamma", "1.5", "--coef0", "-1.5", "--cost", "2", "--epsilon", "1e-12", "--max_iter", "100", "--classification", "oao", "--solver", "cg_implicit", "--backend", "automatic", "--target_platform", "gpu_nvidia", "--use_strings_as_labels", "--verbosity", "libsvm" };
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
-    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "nd_range", "--sycl_implementation_type", "dpcpp" });
+    cmd_args.insert(cmd_args.end(), { "--sycl_kernel_invocation_type", "work_group", "--sycl_implementation_type", "dpcpp" });
 #endif
 #if defined(PLSSVM_HAS_KOKKOS_BACKEND)
     const std::string space = fmt::format("{}", plssvm::kokkos::list_available_execution_spaces()[1]);  // [0] would be automatic
@@ -202,7 +203,7 @@ TEST_F(ParserTrain, all_arguments_output) {
         "solver: cg_implicit\n";
 #if defined(PLSSVM_HAS_SYCL_BACKEND)
     correct += "SYCL implementation type: dpcpp\n"
-               "SYCL kernel invocation type: nd_range\n";
+               "SYCL kernel invocation type: work_group\n";
 #else
     correct += "SYCL implementation type: automatic\n"
                "SYCL kernel invocation type: automatic\n";
@@ -547,7 +548,7 @@ TEST_P(ParserTrainSYCLKernelInvocation, parsing) {
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(ParserTrain, ParserTrainSYCLKernelInvocation, ::testing::Combine(
                 ::testing::Values("--sycl_kernel_invocation_type"),
-                ::testing::Values("automatic", "nd_range", "ND_RANGE")),
+                ::testing::Values("automatic", "auto", "basic", "nd_range", "work_group", "hierarchical", "scoped")),
                 naming::pretty_print_parameter_flag_and_value);
 // clang-format on
diff --git a/tests/parameter.cpp b/tests/parameter.cpp
index 80de6eec7..940aa7d08 100644
--- a/tests/parameter.cpp
+++ b/tests/parameter.cpp
@@ -99,7 +99,7 @@ TEST(Parameter, construct_parameter_and_named_args) {
     const plssvm::parameter param{ param_base,
                                    plssvm::kernel_type = plssvm::kernel_function_type::rbf,
                                    plssvm::sycl_implementation_type = plssvm::sycl::implementation_type::adaptivecpp,
-                                   plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::nd_range,
+                                   plssvm::sycl_kernel_invocation_type = plssvm::sycl::kernel_invocation_type::work_group,
                                    plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda };
 
     // test default values
diff --git a/utility_scripts/performance_analysis.py b/utility_scripts/performance_analysis.py
index 675fb5202..48a5cb179 100644
--- a/utility_scripts/performance_analysis.py
+++ b/utility_scripts/performance_analysis.py
@@ -120,7 +120,7 @@ def fit_model_with_timeout(csvm, data, eps):
         if sycl_impl == plssvm.sycl.ImplementationType.AUTOMATIC:
             continue
         available_backends.append((backend, { "sycl_implementation_type": sycl_impl,
-                                              "sycl_kernel_invocation_type": plssvm.sycl.KernelInvocationType.ND_RANGE }))
+                                              "sycl_kernel_invocation_type": plssvm.sycl.KernelInvocationType.WORK_GROUP }))
     else:
         available_backends.append((backend, { }))