From 8b77d1ccaf98b1f4409aa7f298db9131d3552148 Mon Sep 17 00:00:00 2001 From: tongliu Date: Thu, 8 Jan 2026 19:16:06 -0800 Subject: [PATCH 1/2] fix router --- tests/pytorch/test_fused_router.py | 2 +- .../common/fused_router/fused_score_for_moe_aux_loss.cu | 2 ++ .../common/fused_router/fused_topk_with_score_function.cu | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/test_fused_router.py b/tests/pytorch/test_fused_router.py index fa134ba4bd..5522911e4b 100644 --- a/tests/pytorch/test_fused_router.py +++ b/tests/pytorch/test_fused_router.py @@ -402,7 +402,7 @@ def profile_topk_softmax( test_topk_softmax( dtype=torch.float32, num_tokens=1024, - num_experts=128, + num_experts=3000, topk=4, use_pre_softmax=False, group_topk=None, diff --git a/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu b/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu index 03d22942b5..cc5afec313 100644 --- a/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu +++ b/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu @@ -147,6 +147,7 @@ void fused_score_for_moe_aux_loss_forward_kernel_launcher( size_t shared_memory_size = num_experts * num_token_per_block * sizeof(DataType) // logits + topk * num_token_per_block * sizeof(DataType) // topk_logits + topk * num_token_per_block * sizeof(int); // topk_indices + cudaFuncSetAttribute(fused_score_for_moe_aux_loss_forward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_score_for_moe_aux_loss_forward_kernel <<>>( logits, num_tokens, num_experts, topk, score_function, scores, routing_map, @@ -283,6 +284,7 @@ void fused_score_for_moe_aux_loss_backward_kernel_launcher( + num_experts * num_token_per_block * sizeof(DataType) // act_from_fwd + num_experts * num_token_per_block * sizeof(DataType); // comp_buf + cudaFuncSetAttribute(fused_score_for_moe_aux_loss_backward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_score_for_moe_aux_loss_backward_kernel <<>>( intermediate_output, grad_scores, num_tokens, num_experts, topk, score_function, diff --git a/transformer_engine/common/fused_router/fused_topk_with_score_function.cu b/transformer_engine/common/fused_router/fused_topk_with_score_function.cu index 03e972332a..fac2ce3425 100644 --- a/transformer_engine/common/fused_router/fused_topk_with_score_function.cu +++ b/transformer_engine/common/fused_router/fused_topk_with_score_function.cu @@ -253,6 +253,7 @@ void fused_topk_with_score_function_forward_kernel_launcher( shared_memory_size += num_groups * num_token_per_block * sizeof(DataType); // group_scores shared_memory_size += num_experts * num_token_per_block * sizeof(DataType); // maksed_scores } + cudaFuncSetAttribute(fused_topk_with_score_function_forward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_topk_with_score_function_forward_kernel <<>>( logits, num_tokens, num_experts, topk, use_pre_softmax, num_groups, group_topk, @@ -444,6 +445,7 @@ void fused_topk_with_score_function_backward_kernel_launcher( num_experts * num_token_per_block * sizeof(DataType) // act_from_fwd + num_experts * num_token_per_block * sizeof(DataType) // comp_buf + num_experts * num_token_per_block * sizeof(bool); // routing_map + cudaFuncSetAttribute(fused_topk_with_score_function_backward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_topk_with_score_function_backward_kernel <<>>( routing_map, intermediate_output, grad_probs, num_tokens, num_experts, topk, From 273a247c6dcdb7bebcb08db550348ee52d4f5b91 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Jan 2026 03:21:18 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/fused_router/fused_score_for_moe_aux_loss.cu | 6 ++++-- .../common/fused_router/fused_topk_with_score_function.cu | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu b/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu index cc5afec313..197c662d7b 100644 --- a/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu +++ b/transformer_engine/common/fused_router/fused_score_for_moe_aux_loss.cu @@ -147,7 +147,8 @@ void fused_score_for_moe_aux_loss_forward_kernel_launcher( size_t shared_memory_size = num_experts * num_token_per_block * sizeof(DataType) // logits + topk * num_token_per_block * sizeof(DataType) // topk_logits + topk * num_token_per_block * sizeof(int); // topk_indices - cudaFuncSetAttribute(fused_score_for_moe_aux_loss_forward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); + cudaFuncSetAttribute(fused_score_for_moe_aux_loss_forward_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_score_for_moe_aux_loss_forward_kernel <<>>( logits, num_tokens, num_experts, topk, score_function, scores, routing_map, @@ -284,7 +285,8 @@ void fused_score_for_moe_aux_loss_backward_kernel_launcher( + num_experts * num_token_per_block * sizeof(DataType) // act_from_fwd + num_experts * num_token_per_block * sizeof(DataType); // comp_buf - cudaFuncSetAttribute(fused_score_for_moe_aux_loss_backward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); + cudaFuncSetAttribute(fused_score_for_moe_aux_loss_backward_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_score_for_moe_aux_loss_backward_kernel <<>>( intermediate_output, grad_scores, num_tokens, num_experts, topk, score_function, diff --git a/transformer_engine/common/fused_router/fused_topk_with_score_function.cu b/transformer_engine/common/fused_router/fused_topk_with_score_function.cu index fac2ce3425..2ec497188a 100644 --- a/transformer_engine/common/fused_router/fused_topk_with_score_function.cu +++ b/transformer_engine/common/fused_router/fused_topk_with_score_function.cu @@ -253,7 +253,8 @@ void fused_topk_with_score_function_forward_kernel_launcher( shared_memory_size += num_groups * num_token_per_block * sizeof(DataType); // group_scores shared_memory_size += num_experts * num_token_per_block * sizeof(DataType); // maksed_scores } - cudaFuncSetAttribute(fused_topk_with_score_function_forward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); + cudaFuncSetAttribute(fused_topk_with_score_function_forward_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_topk_with_score_function_forward_kernel <<>>( logits, num_tokens, num_experts, topk, use_pre_softmax, num_groups, group_topk, @@ -445,7 +446,8 @@ void fused_topk_with_score_function_backward_kernel_launcher( num_experts * num_token_per_block * sizeof(DataType) // act_from_fwd + num_experts * num_token_per_block * sizeof(DataType) // comp_buf + num_experts * num_token_per_block * sizeof(bool); // routing_map - cudaFuncSetAttribute(fused_topk_with_score_function_backward_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); + cudaFuncSetAttribute(fused_topk_with_score_function_backward_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size); fused_topk_with_score_function_backward_kernel <<>>( routing_map, intermediate_output, grad_probs, num_tokens, num_experts, topk,