Skip to content

Commit 3942c9c

Browse files
committed
GPU: Use more precise way to time HIP kernel event timers
1 parent 351b273 commit 3942c9c

File tree

2 files changed

+22
-11
lines changed

2 files changed

+22
-11
lines changed

GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434
#ifndef GPUCA_KRNL_REG
3535
#define GPUCA_KRNL_REG(...)
3636
#endif
37+
#ifndef GPUCA_KRNL_BACKEND_XARGS
38+
#define GPUCA_KRNL_BACKEND_XARGS
39+
#endif
3740
#define GPUCA_ATTRRES_REG(reg, num, ...) GPUCA_KRNL_REG(num) GPUCA_ATTRRES2(__VA_ARGS__)
3841
#define GPUCA_ATTRRES2_REG(reg, num, ...) GPUCA_KRNL_REG(num) GPUCA_ATTRRES3(__VA_ARGS__)
3942
#define GPUCA_ATTRRES_NONE(...)
@@ -68,7 +71,7 @@ GPUg() void GPUCA_ATTRRES(GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CA
6871
template <> class GPUCA_KRNL_BACKEND_CLASS::backendInternal<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
6972
public: \
7073
template <typename T, typename... Args> \
71-
static inline void runKernelBackendInternal(krnlSetup& _xyz, T* me, const Args&... args) \
74+
static inline void runKernelBackendInternal(krnlSetup& _xyz, T* me, GPUCA_KRNL_BACKEND_XARGS const Args&... args) \
7275
{ \
7376
auto& x = _xyz.x; \
7477
auto& y = _xyz.y;

GPU/GPUTracking/Base/hip/GPUReconstructionHIP.hip.cxx

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#define GPUCA_GPUTYPE_VEGA
1515
#include <hip/hip_runtime.h>
16+
#include "hip/hip_ext.h"
1617

1718
#ifdef __CUDACC__
1819
#define __HIPCC_CUDA__
@@ -73,13 +74,22 @@ GPUg() void runKernelHIP(GPUCA_CONSMEM_PTR int iSlice, Args... args)
7374

7475
#undef GPUCA_KRNL_REG
7576
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args))
77+
#undef GPUCA_KRNL_BACKEND_XARGS
78+
#define GPUCA_KRNL_BACKEND_XARGS hipEvent_t *start, hipEvent_t *stop,
7679
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward) GPUCA_KRNL_WRAP(GPUCA_KRNL_, x_class, x_attributes, x_arguments, x_forward)
7780
#define GPUCA_KRNL_BACKEND_CLASS GPUReconstructionHIPBackend
78-
#define GPUCA_KRNL_CALL_single(x_class, x_attributes, x_arguments, x_forward) \
79-
hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->HIPStreams[x.stream], GPUCA_CONSMEM_CALL y.start, args...);
80-
#define GPUCA_KRNL_CALL_multi(x_class, x_attributes, x_arguments, x_forward) \
81-
hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->HIPStreams[x.stream], GPUCA_CONSMEM_CALL y.start, y.num, args...);
82-
81+
#define GPUCA_KRNL_CALL_single(x_class, x_attributes, x_arguments, x_forward) \
82+
if (start == nullptr) { \
83+
hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->HIPStreams[x.stream], GPUCA_CONSMEM_CALL y.start, args...); \
84+
} else { \
85+
hipExtLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->HIPStreams[x.stream], *start, *stop, 0, GPUCA_CONSMEM_CALL y.start, args...); \
86+
}
87+
#define GPUCA_KRNL_CALL_multi(x_class, x_attributes, x_arguments, x_forward) \
88+
if (start == nullptr) { \
89+
hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->HIPStreams[x.stream], GPUCA_CONSMEM_CALL y.start, y.num, args...); \
90+
} else { \
91+
hipExtLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->HIPStreams[x.stream], *start, *stop, 0, GPUCA_CONSMEM_CALL y.start, y.num, args...); \
92+
}
8393
#include "GPUReconstructionKernels.h"
8494
#undef GPUCA_KRNL
8595

@@ -97,17 +107,15 @@ int GPUReconstructionHIPBackend::runKernelBackend(krnlSetup& _xyz, Args... args)
97107
if (mDeviceProcessingSettings.deviceTimers) {
98108
GPUFailedMsg(hipEventCreate(&start));
99109
GPUFailedMsg(hipEventCreate(&stop));
100-
GPUFailedMsg(hipEventRecord(start, mInternals->HIPStreams[x.stream]));
101-
}
102-
backendInternal<T, I>::runKernelBackendInternal(_xyz, this, args...);
103-
if (mDeviceProcessingSettings.deviceTimers) {
104-
GPUFailedMsg(hipEventRecord(stop, mInternals->HIPStreams[x.stream]));
110+
backendInternal<T, I>::runKernelBackendInternal(_xyz, this, &start, &stop, args...);
105111
GPUFailedMsg(hipEventSynchronize(stop));
106112
float v;
107113
GPUFailedMsg(hipEventElapsedTime(&v, start, stop));
108114
_xyz.t = v * 1.e-3;
109115
GPUFailedMsg(hipEventDestroy(start));
110116
GPUFailedMsg(hipEventDestroy(stop));
117+
} else {
118+
backendInternal<T, I>::runKernelBackendInternal(_xyz, this, nullptr, nullptr, args...);
111119
}
112120
GPUFailedMsg(hipGetLastError());
113121
if (z.ev) {

0 commit comments

Comments
 (0)