From 6415b36bcc7ab2971bc86bdb134d49be6c39e64e Mon Sep 17 00:00:00 2001
From: bolunz <chamberlain0w0@gmail.com>
Date: Fri, 22 May 2026 15:08:27 +0800
Subject: [PATCH 1/3] fix: correct cast kernel count by setting profile context
 after autocast

---
 infini_train/include/dispatcher.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/infini_train/include/dispatcher.h b/infini_train/include/dispatcher.h
index 29d11b73..638df76a 100644
--- a/infini_train/include/dispatcher.h
+++ b/infini_train/include/dispatcher.h
@@ -74,6 +74,9 @@ class Dispatcher {
     template <typename RetT, class... ArgsT> RetT Call(KeyT key, ArgsT... args) const {
         auto kernel = this->GetKernel(key);
         tls_autocast_context.Autocast(key, args...);
+#ifdef PROFILE_MODE
+        SetProfileContext(key.second, key.first); // presumably re-tags profiling to this key after Autocast; confirm
+#endif
         return kernel.Call<RetT>(std::forward<ArgsT>(args)...);
     }
 

From 709d4476283ab3aac570a601409ea769dcf28d7d Mon Sep 17 00:00:00 2001
From: bolunz <chamberlain0w0@gmail.com>
Date: Tue, 26 May 2026 10:41:49 +0800
Subject: [PATCH 2/3] fix: fetch AccumulateGrad kernel right before use in
 ReduceAddCoalesced so the profiler works correctly

---
 infini_train/src/kernels/cuda/comm.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/infini_train/src/kernels/cuda/comm.cu b/infini_train/src/kernels/cuda/comm.cu
index 6ccad9e4..9e3fa3a2 100644
--- a/infini_train/src/kernels/cuda/comm.cu
+++ b/infini_train/src/kernels/cuda/comm.cu
@@ -25,7 +25,6 @@ std::vector<std::shared_ptr<Tensor>> Broadcast(const std::vector<std::shared_ptr
 std::vector<std::shared_ptr<Tensor>> ReduceAddCoalesced(const std::vector<std::vector<std::shared_ptr<Tensor>>> &grads,
                                                         Device destination) {
     std::vector<std::shared_ptr<Tensor>> outputs;
-    auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"});
     std::vector<std::vector<std::shared_ptr<Tensor>>> to_destination_grads;
     for (int i = 0; i < grads[0].size(); ++i) {
         outputs.emplace_back(std::make_shared<Tensor>(grads[0][i]->Dims(), grads[0][i]->Dtype(), destination));
@@ -37,6 +36,9 @@ std::vector<std::shared_ptr<Tensor>> ReduceAddCoalesced(const std::vector<std::v
             to_destination_grads[i].push_back(std::make_shared<Tensor>(grads[i][j]->To(destination)));
         }
     }
+    // NOTE(zbl): To ensure Profiler works correctly, there should not be any other kernel calls 
+    //            between GetKernel and kernel.Call, otherwise ProfileContext would be tainted
+    auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"});
     for (int i = 0; i < grads.size(); ++i) {
         for (int j = 0; j < grads[i].size(); ++j) {
             kernel.Call<void>(to_destination_grads[i][j], static_cast<float>(1.0), outputs[j]);

From e8c51e4d250eb473776fc7e223d1ff8c547c42b6 Mon Sep 17 00:00:00 2001
From: bolunz <chamberlain0w0@gmail.com>
Date: Tue, 26 May 2026 10:43:11 +0800
Subject: [PATCH 3/3] style: format comments

---
 infini_train/src/kernels/cuda/comm.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infini_train/src/kernels/cuda/comm.cu b/infini_train/src/kernels/cuda/comm.cu
index 9e3fa3a2..b4bdafd8 100644
--- a/infini_train/src/kernels/cuda/comm.cu
+++ b/infini_train/src/kernels/cuda/comm.cu
@@ -36,7 +36,7 @@ std::vector<std::shared_ptr<Tensor>> ReduceAddCoalesced(const std::vector<std::v
             to_destination_grads[i].push_back(std::make_shared<Tensor>(grads[i][j]->To(destination)));
         }
     }
-    // NOTE(zbl): To ensure Profiler works correctly, there should not be any other kernel calls 
+    // NOTE(zbl): To ensure Profiler works correctly, there should not be any other kernel calls
     //            between GetKernel and kernel.Call, otherwise ProfileContext would be tainted
     auto kernel = Dispatcher::Instance().GetKernel({destination.type(), "AccumulateGrad"});
     for (int i = 0; i < grads.size(); ++i) {