Skip to content

Commit 5b2706c

Browse files
authored
Merge pull request #13 from SC-SGS/hierarchical_kernel
Hierarchical kernel
2 parents f17ad62 + 2d34157 commit 5b2706c

29 files changed

+912
-237
lines changed

.jenkins/Jenkinsfile-tests

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ pipeline {
198198
}
199199
}
200200
}
201+
/*
201202
stage('build plssvm DPC++ Debug') {
202203
steps {
203204
dir('plssvm') {
@@ -213,6 +214,7 @@ pipeline {
213214
}
214215
}
215216
}
217+
*/
216218
}
217219
post {
218220
always {

CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66

77
cmake_minimum_required(VERSION 3.18)
88

9-
project("PLSSVM - Parallel Least-Squares Support Vector Machine"
10-
VERSION 1.0.1
9+
project("PLSSVM - Parallel Least Squares Support Vector Machine"
10+
VERSION 1.1.0
1111
LANGUAGES CXX
12-
DESCRIPTION "A Support Vector Machine implementation using different backends.")
12+
DESCRIPTION "A Least Squares Support Vector Machine implementation using different backends.")
1313

1414

1515
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/add_custom_build_type.cmake)
@@ -39,6 +39,7 @@ set(PLSSVM_BASE_SOURCES
3939
${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_predict.cpp
4040
${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_train.cpp
4141
${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/target_platforms.cpp
42+
${CMAKE_CURRENT_LIST_DIR}/src/plssvm/backends/SYCL/kernel_invocation_type.cpp
4243
)
4344

4445
## create base library: linked against all backend libraries

README.md

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -259,9 +259,9 @@ LS-SVM with multiple (GPU-)backends
259259
Usage:
260260
./svm-train [OPTION...] training_set_file [model_file]
261261

262-
-t, --kernel_type arg set type of kernel function.
262+
-t, --kernel_type arg set type of kernel function.
263263
0 -- linear: u'*v
264-
1 -- polynomial: (gamma*u'*v + coef0)^degree
264+
1 -- polynomial: (gamma*u'*v + coef0)^degree
265265
2 -- radial basis function: exp(-gamma*|u-v|^2) (default: 0)
266266
-d, --degree arg set degree in kernel function (default: 3)
267267
-g, --gamma arg set gamma in kernel function (default: 1 / num_features)
@@ -270,11 +270,13 @@ Usage:
270270
-e, --epsilon arg set the tolerance of termination criterion (default: 0.001)
271271
-b, --backend arg choose the backend: openmp|cuda|opencl|sycl (default: openmp)
272272
-p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic)
273+
--sycl_kernel_invocation_type arg
274+
choose the kernel invocation type when using SYCL as backend: automatic|nd_range|hierarchical (default: automatic)
273275
-q, --quiet quiet mode (no outputs)
274276
-h, --help print this helper message
275277
--input training_set_file
276-
277-
--model model_file
278+
279+
--model model_file
278280
```
279281
280282
An example invocation using the CUDA backend could look like:
@@ -289,13 +291,17 @@ Another example targeting NVIDIA GPUs using the SYCL backend looks like:
289291
./svm-train --backend sycl --target_platform gpu_nvidia --input /path/to/data_file
290292
```
291293
292-
The `--target_platform=automatic` flags works for the different backends as follows:
294+
The `--target_platform=automatic` flag works for the different backends as follows:
293295
294296
- `OpenMP`: always selects a CPU
295297
- `CUDA`: always selects an NVIDIA GPU (if no NVIDIA GPU is available, throws an exception)
296298
- `OpenCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
297299
- `SYCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU
298300
301+
The `--sycl_kernel_invocation_type` flag is only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`.
302+
If the `--sycl_kernel_invocation_type` is `automatic`, the `nd_range` invocation type is always used,
303+
except for hipSYCL on CPUs where the hierarchical formulation is used instead.
304+
299305
### Predicting
300306
301307
```bash

include/plssvm/backends/OpenCL/detail/utility.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,18 @@ namespace plssvm::opencl::detail {
4646
void device_assert(error_code code, std::string_view msg = "");
4747

4848
/**
49-
* @brief Returns the list devices matching the target platform @p target.
49+
* @brief Returns the list devices matching the target platform @p target and the actually used target platform
50+
* (only interesting if the provided @p target was automatic).
5051
* @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
5152
* 1. NVIDIA GPUs
5253
* 2. AMD GPUs
5354
* 3. Intel GPUs
5455
* 4. CPUs
5556
*
5657
* @param[in] target the target platform for which the devices must match
57-
* @return the command queues (`[[nodiscard]]`)
58+
* @return the command queues and used target platform (`[[nodiscard]]`)
5859
*/
59-
[[nodiscard]] std::vector<command_queue> get_command_queues(target_platform target);
60+
[[nodiscard]] std::pair<std::vector<command_queue>, target_platform> get_command_queues(target_platform target);
6061

6162
/**
6263
* @brief Wait for the compute device associated with @p queue to finish.

include/plssvm/backends/SYCL/csvm.hpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111

1212
#pragma once
1313

14-
#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr
15-
#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm
14+
#include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr
15+
#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl::kernel_invocation_type
16+
#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm
1617

1718
#include "sycl/sycl.hpp" // sycl::queue
1819

@@ -45,6 +46,7 @@ class csvm : public ::plssvm::detail::gpu_csvm<T, ::plssvm::sycl::detail::device
4546
using base_type::coef0_;
4647
using base_type::cost_;
4748
using base_type::degree_;
49+
using base_type::dept_;
4850
using base_type::gamma_;
4951
using base_type::kernel_;
5052
using base_type::num_data_points_;
@@ -105,6 +107,10 @@ class csvm : public ::plssvm::detail::gpu_csvm<T, ::plssvm::sycl::detail::device
105107
* @copydoc plssvm::detail::gpu_csvm::run_predict_kernel
106108
*/
107109
void run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, std::size_t num_predict_points) final;
110+
111+
private:
112+
/// The SYCL kernel invocation type for the svm kernel. Either nd_range or hierarchical.
113+
kernel_invocation_type invocation_type_;
108114
};
109115

110116
extern template class csvm<float>;

include/plssvm/backends/SYCL/detail/utility.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,24 @@
1515

1616
#include "sycl/sycl.hpp" // sycl::queue
1717

18-
#include <vector> // std::vector
18+
#include <utility> // std::pair
19+
#include <vector> // std::vector
1920

2021
namespace plssvm::sycl::detail {
2122

2223
/**
23-
* @brief Returns the list devices matching the target platform @p target.
24+
* @brief Returns the list devices matching the target platform @p target and the actually used target platform
25+
* (only interesting if the provided @p target was automatic).
2426
* @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
2527
* 1. NVIDIA GPUs
2628
* 2. AMD GPUs
2729
* 3. Intel GPUs
2830
* 4. CPUs
2931
*
3032
* @param[in] target the target platform for which the devices must match
31-
* @return the devices (`[[nodiscard]]`)
33+
* @return the devices and used target platform (`[[nodiscard]]`)
3234
*/
33-
[[nodiscard]] std::vector<::sycl::queue> get_device_list(target_platform target);
35+
[[nodiscard]] std::pair<std::vector<::sycl::queue>, target_platform> get_device_list(target_platform target);
3436
/**
3537
* @brief Wait for the compute device associated with @p queue to finish.
3638
* @param[in] queue the SYCL queue to synchronize
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/**
2+
* @file
3+
* @author Alexander Van Craen
4+
* @author Marcel Breyer
5+
* @copyright 2018-today The PLSSVM project - All Rights Reserved
6+
* @license This file is part of the PLSSVM project which is released under the MIT license.
7+
* See the LICENSE.md file in the project root for full license information.
8+
*
9+
* @brief Defines all available kernel invoke types when using SYCL.
10+
*/
11+
12+
#pragma once
13+
14+
#include <iosfwd> // forward declare std::ostream and std::istream
15+
16+
namespace plssvm::sycl {
17+
18+
/**
19+
* @brief Enum class for all possible SYCL kernel invocation types.
20+
*/
21+
enum class kernel_invocation_type {
22+
/** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */
23+
automatic,
24+
/** Use the [*nd_range* invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke). */
25+
nd_range,
26+
/** Use the SYCL specific [hierarchical invocation type](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_hierarchical_invoke). */
27+
hierarchical
28+
};
29+
30+
/**
31+
* @brief Output the @p invocation type to the given output-stream @p out.
32+
* @param[in,out] out the output-stream to write the backend type to
33+
* @param[in] invocation the SYCL kernel invocation type
34+
* @return the output-stream
35+
*/
36+
std::ostream &operator<<(std::ostream &out, kernel_invocation_type invocation);
37+
38+
/**
39+
* @brief Use the input-stream @p in to initialize the @p invocation type.
40+
* @param[in,out] in input-stream to extract the backend type from
41+
* @param[in] invocation the SYCL kernel invocation type
42+
* @return the input-stream
43+
*/
44+
std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation);
45+
46+
} // namespace plssvm::sycl

include/plssvm/backends/SYCL/predict_kernel.hpp

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,11 @@ class device_kernel_w_linear {
4444

4545
/**
4646
* @brief Function call operator overload performing the actual calculation.
47-
* @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
48-
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
47+
* @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
48+
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
4949
*/
50-
void operator()(::sycl::nd_item<1> nd_idx) const {
51-
const kernel_index_type index = nd_idx.get_global_linear_id();
52-
real_type temp = 0;
50+
void operator()(::sycl::id<1> index) const {
51+
real_type temp{ 0.0 };
5352
if (index < num_features_) {
5453
for (kernel_index_type dat = 0; dat < num_data_points_ - 1; ++dat) {
5554
temp += alpha_d_[dat] * data_d_[dat + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * index];
@@ -99,12 +98,11 @@ class device_kernel_predict_poly {
9998

10099
/**
101100
* @brief Function call operator overload performing the actual calculation.
102-
* @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
103-
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
101+
* @param[in] idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
104102
*/
105-
void operator()(::sycl::nd_item<2> nd_idx) const {
106-
const kernel_index_type data_point_index = nd_idx.get_global_id(0);
107-
const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
103+
void operator()(::sycl::nd_item<2> idx) const {
104+
const kernel_index_type data_point_index = idx.get_global_id(0);
105+
const kernel_index_type predict_point_index = idx.get_global_id(1);
108106

109107
real_type temp = 0;
110108
if (predict_point_index < num_predict_points_) {
@@ -165,12 +163,11 @@ class device_kernel_predict_radial {
165163

166164
/**
167165
* @brief Function call operator overload performing the actual calculation.
168-
* @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
169-
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
166+
* @param[in] idx the [`sycl::nd_item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#nditem-class) identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
170167
*/
171-
void operator()(::sycl::nd_item<2> nd_idx) const {
172-
const kernel_index_type data_point_index = nd_idx.get_global_id(0);
173-
const kernel_index_type predict_point_index = nd_idx.get_global_id(1);
168+
void operator()(::sycl::nd_item<2> idx) const {
169+
const kernel_index_type data_point_index = idx.get_global_id(0);
170+
const kernel_index_type predict_point_index = idx.get_global_id(1);
174171

175172
real_type temp = 0;
176173
if (predict_point_index < num_predict_points_) {

include/plssvm/backends/SYCL/q_kernel.hpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,10 @@ class device_kernel_q_linear {
4141

4242
/**
4343
* @brief Function call operator overload performing the actual calculation.
44-
* @param[in] item the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
45-
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
44+
* @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
45+
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
4646
*/
47-
void operator()(::sycl::nd_item<1> item) const {
48-
const kernel_index_type index = item.get_global_linear_id();
47+
void operator()(::sycl::id<1> index) const {
4948
real_type temp{ 0.0 };
5049
for (kernel_index_type i = 0; i < feature_range_; ++i) {
5150
temp += data_d_[i * num_rows_ + index] * data_last_[i];
@@ -88,11 +87,10 @@ class device_kernel_q_poly {
8887

8988
/**
9089
* @brief Function call operator overload performing the actual calculation.
91-
* @param[in] item the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
92-
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
90+
* @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
91+
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
9392
*/
94-
void operator()(::sycl::nd_item<1> item) const {
95-
const kernel_index_type index = item.get_global_linear_id();
93+
void operator()(::sycl::id<1> index) const {
9694
real_type temp{ 0.0 };
9795
for (kernel_index_type i = 0; i < num_cols_; ++i) {
9896
temp += data_d_[i * num_rows_ + index] * data_last_[i];
@@ -136,11 +134,10 @@ class device_kernel_q_radial {
136134

137135
/**
138136
* @brief Function call operator overload performing the actual calculation.
139-
* @param[in] item the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class)
140-
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
137+
* @param[in] index the [`sycl::id`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#id-class)
138+
* identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class)
141139
*/
142-
void operator()(::sycl::nd_item<1> item) const {
143-
const kernel_index_type index = item.get_global_linear_id();
140+
void operator()(::sycl::id<1> index) const {
144141
real_type temp{ 0.0 };
145142
for (kernel_index_type i = 0; i < num_cols_; ++i) {
146143
temp += (data_d_[i * num_rows_ + index] - data_last_[i]) * (data_d_[i * num_rows_ + index] - data_last_[i]);

0 commit comments

Comments
 (0)