From 6415b36bcc7ab2971bc86bdb134d49be6c39e64e Mon Sep 17 00:00:00 2001 From: bolunz Date: Fri, 22 May 2026 15:08:27 +0800 Subject: [PATCH 1/3] fix: fix cast kernel count --- infini_train/include/dispatcher.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/infini_train/include/dispatcher.h b/infini_train/include/dispatcher.h index 29d11b73..638df76a 100644 --- a/infini_train/include/dispatcher.h +++ b/infini_train/include/dispatcher.h @@ -74,6 +74,9 @@ class Dispatcher { template RetT Call(KeyT key, ArgsT... args) const { auto kernel = this->GetKernel(key); tls_autocast_context.Autocast(key, args...); +#ifdef PROFILE_MODE + SetProfileContext(key.second, key.first); +#endif return kernel.Call(std::forward(args)...); } From 709d4476283ab3aac570a601409ea769dcf28d7d Mon Sep 17 00:00:00 2001 From: bolunz Date: Tue, 26 May 2026 10:41:49 +0800 Subject: [PATCH 2/3] fix: fix ReduceAddCoalesced kernel to make sure profiler works correctly --- infini_train/src/kernels/cuda/comm.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/infini_train/src/kernels/cuda/comm.cu b/infini_train/src/kernels/cuda/comm.cu index 6ccad9e4..9e3fa3a2 100644 --- a/infini_train/src/kernels/cuda/comm.cu +++ b/infini_train/src/kernels/cuda/comm.cu @@ -25,7 +25,6 @@ std::vector> Broadcast(const std::vector> ReduceAddCoalesced(const std::vector>> &grads, Device destination) { std::vector> outputs; - auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"}); std::vector>> to_destination_grads; for (int i = 0; i < grads[0].size(); ++i) { outputs.emplace_back(std::make_shared(grads[0][i]->Dims(), grads[0][i]->Dtype(), destination)); @@ -37,6 +36,9 @@ std::vector> ReduceAddCoalesced(const std::vector(grads[i][j]->To(destination))); } } + // NOTE(zbl): To ensure Profiler works correctly, there should not be any other kernel calls + // between GetKernel and kernel.Call, otherwise ProfileContext would be tainted + auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"}); for (int i = 0; i < grads.size(); ++i) { for (int j = 0; j < grads[i].size(); ++j) { kernel.Call(to_destination_grads[i][j], static_cast(1.0), outputs[j]); From e8c51e4d250eb473776fc7e223d1ff8c547c42b6 Mon Sep 17 00:00:00 2001 From: bolunz Date: Tue, 26 May 2026 10:43:11 +0800 Subject: [PATCH 3/3] style: format comments --- infini_train/src/kernels/cuda/comm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infini_train/src/kernels/cuda/comm.cu b/infini_train/src/kernels/cuda/comm.cu index 9e3fa3a2..b4bdafd8 100644 --- a/infini_train/src/kernels/cuda/comm.cu +++ b/infini_train/src/kernels/cuda/comm.cu @@ -36,7 +36,7 @@ std::vector> ReduceAddCoalesced(const std::vector(grads[i][j]->To(destination))); } } - // NOTE(zbl): To ensure Profiler works correctly, there should not be any other kernel calls + // NOTE(zbl): To ensure Profiler works correctly, there should not be any other kernel calls // between GetKernel and kernel.Call, otherwise ProfileContext would be tainted auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"}); for (int i = 0; i < grads.size(); ++i) {