@@ -55,6 +55,12 @@ struct OpenCLBenchmarkEnvironment
5555 ooq = cl::CommandQueue{context, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE};
5656 }
5757
58+ void TearDown (void )
59+ {
60+ ioq = NULL ;
61+ ooq = NULL ;
62+ }
63+
5864 cl::Platform platform;
5965 cl::Device device;
6066 cl::Context context;
@@ -208,6 +214,8 @@ struct Kernel : public benchmark::Fixture
208214 cl::Program program;
209215 cl::Kernel kernel;
210216
217+ clGetKernelSuggestedLocalWorkSizeKHR_fn clGetKernelSuggestedLocalWorkSizeKHR = nullptr ;
218+
211219 virtual void SetUp (benchmark::State& state) override {
212220 queue = env.ioq ;
213221
@@ -225,6 +233,9 @@ struct Kernel : public benchmark::Fixture
225233 kernel = cl::Kernel{program, " Silly" };
226234
227235 kernel.setArg (0 , nullptr );
236+
237+ clGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn)
238+ clGetExtensionFunctionAddressForPlatform (env.platform (), " clGetKernelSuggestedLocalWorkSizeKHR" );
228239 }
229240 virtual void TearDown (benchmark::State& state) override {
230241 program = NULL ;
@@ -385,6 +396,48 @@ BENCHMARK_DEFINE_F(Kernel, clEnqueueNDRangeKernel_overhead)(benchmark::State& st
385396}
386397BENCHMARK_REGISTER_F (Kernel, clEnqueueNDRangeKernel_overhead)->ArgsProduct({{0 , 1 }, {1 , 32 *1024 *1024 }});
387398
399+ BENCHMARK_DEFINE_F (Kernel, clEnqueueNDRangeKernel_LocalWorkSize)(benchmark::State& state)
400+ {
401+ const bool useLocalWorkSize = state.range (0 ) == 1 ;
402+
403+ const size_t work_dim = 1 ;
404+ const size_t global_work_size[work_dim] = { 256 };
405+
406+ size_t suggested_local_work_size[work_dim];
407+ if (clGetKernelSuggestedLocalWorkSizeKHR) {
408+ clGetKernelSuggestedLocalWorkSizeKHR (
409+ queue (),
410+ kernel (),
411+ work_dim,
412+ NULL ,
413+ global_work_size,
414+ suggested_local_work_size);
415+ } else {
416+ suggested_local_work_size[0 ] = 64 ;
417+ }
418+
419+ const size_t * local_work_size = useLocalWorkSize ?
420+ suggested_local_work_size :
421+ NULL ;
422+ for (auto _ : state) {
423+ clEnqueueNDRangeKernel (
424+ queue (),
425+ kernel (),
426+ work_dim,
427+ NULL ,
428+ global_work_size,
429+ local_work_size,
430+ 0 ,
431+ NULL ,
432+ NULL );
433+ clFinish (queue ());
434+ }
435+
436+ clFinish (queue ());
437+ }
438+ BENCHMARK_REGISTER_F (Kernel, clEnqueueNDRangeKernel_LocalWorkSize)->Arg(0 )->ArgName(" NULL" );
439+ BENCHMARK_REGISTER_F (Kernel, clEnqueueNDRangeKernel_LocalWorkSize)->Arg(1 )->ArgName(" Suggested" );
440+
388441BENCHMARK_DEFINE_F (Kernel, clSetKernelArgSVMPointer_null)(benchmark::State& state)
389442{
390443 for (auto _ : state) {
@@ -589,10 +642,13 @@ int main(int argc, char** argv)
589642 env.ParseArgs (argc, argv);
590643
591644 ::benchmark::Initialize (&argc, argv);
645+ ::benchmark::ReportUnrecognizedArguments (argc, argv);
592646 // if (::benchmark::ReportUnrecognizedArguments(argc, argv)) {
593647 // return 1;
594648 // }
595649 ::benchmark::RunSpecifiedBenchmarks ();
596650 ::benchmark::Shutdown ();
651+
652+ env.TearDown ();
597653 return 0 ;
598654}
0 commit comments