From 2782ec67419cb4b589cd3adf9fbfd727f15a598e Mon Sep 17 00:00:00 2001 From: xunyoyo <1279416582@qq.com> Date: Fri, 30 Jan 2026 14:38:14 +0800 Subject: [PATCH 1/6] Add marlin MoE backend tests --- .../test_fused_moe_marlin_backend.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tests/model_executor/test_fused_moe_marlin_backend.py diff --git a/tests/model_executor/test_fused_moe_marlin_backend.py b/tests/model_executor/test_fused_moe_marlin_backend.py new file mode 100644 index 00000000000..15bbef12271 --- /dev/null +++ b/tests/model_executor/test_fused_moe_marlin_backend.py @@ -0,0 +1,73 @@ +""" +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from types import SimpleNamespace + +import paddle + +from fastdeploy.model_executor.layers.moe import ( + fused_moe_marlin_backend as marlin_backend, +) + +paddle.set_device("gpu") + + +class _DummyLayer(paddle.nn.Layer): + def __init__(self, hidden_size=32, moe_intermediate_size=16, topk_method="topk"): + super().__init__() + self.num_local_experts = 1 + self.num_experts = 1 + self.hidden_size = hidden_size + self.moe_intermediate_size = moe_intermediate_size + self.top_k = 1 + self.n_group = 1 + self.topk_group = 1 + self.topk_method = topk_method + self.routed_scaling_factor = 1.0 + self.gate_correction_bias = paddle.zeros([self.num_experts], dtype="float32") + self.renormalize = True + self.fd_config = SimpleNamespace() + + def extract_moe_ffn_weights(self, state_dict): + return state_dict["up"], state_dict["down"], None, None + + +def _make_weights(layer): + up = [paddle.ones([layer.hidden_size, layer.moe_intermediate_size * 2], dtype="float16")] + down = [paddle.ones([layer.moe_intermediate_size, layer.hidden_size], dtype="float16")] + return up, down + + +def test_marlin_process_and_apply_paths(): + method = marlin_backend.MarlinWeightOnlyMoEMethod() + layer = _DummyLayer() + + method.create_weights(layer) + up, down = _make_weights(layer) + method.process_loaded_weights(layer, {"up": up, "down": down}) + + scales = paddle.arange(64, dtype="float32").reshape([2, 32]) + permuted = marlin_backend.marlin_permute_scales(scales, size_k=16, size_n=32, group_size=8) + assert permuted.shape == [2, 32] + + gate = paddle.nn.Linear(layer.hidden_size, layer.num_experts, bias_attr=False) + x = paddle.ones([2, layer.hidden_size], dtype="float16") + out = method.apply(layer, x, gate, topk_ids_hookfunc=lambda **_k: None) + assert out.shape == [2, layer.hidden_size] + + layer.topk_method = "noaux_tc" + out_noaux = method.apply(layer, x, gate, topk_ids_hookfunc=lambda **_k: None) + assert out_noaux.shape == [2, layer.hidden_size] From bfd1d7fcc60078508f6a359283b6521b37a78103 Mon Sep 17 00:00:00 2001 From: xunyoyo <1279416582@qq.com> Date: Fri, 30 Jan 2026 21:48:47 +0800 Subject: [PATCH 2/6] Adjust marlin test shapes for repack --- tests/model_executor/test_fused_moe_marlin_backend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/model_executor/test_fused_moe_marlin_backend.py b/tests/model_executor/test_fused_moe_marlin_backend.py index 15bbef12271..4dbb5609e5c 100644 --- a/tests/model_executor/test_fused_moe_marlin_backend.py +++ b/tests/model_executor/test_fused_moe_marlin_backend.py @@ -26,7 +26,7 @@ class _DummyLayer(paddle.nn.Layer): - def __init__(self, hidden_size=32, moe_intermediate_size=16, topk_method="topk"): + def __init__(self, hidden_size=64, moe_intermediate_size=32, topk_method="topk"): super().__init__() self.num_local_experts = 1 self.num_experts = 1 @@ -59,9 +59,9 @@ def test_marlin_process_and_apply_paths(): up, down = _make_weights(layer) method.process_loaded_weights(layer, {"up": up, "down": down}) - scales = paddle.arange(64, dtype="float32").reshape([2, 32]) - permuted = marlin_backend.marlin_permute_scales(scales, size_k=16, size_n=32, group_size=8) - assert permuted.shape == [2, 32] + scales = paddle.arange(128, dtype="float32").reshape([2, 64]) + permuted = marlin_backend.marlin_permute_scales(scales, size_k=16, size_n=64, group_size=8) + assert permuted.shape == [2, 64] gate = paddle.nn.Linear(layer.hidden_size, layer.num_experts, bias_attr=False) x = paddle.ones([2, layer.hidden_size], dtype="float16") From 495a5e00892f018c35508c9c9aac507dc84ff78e Mon Sep 17 00:00:00 2001 From: xunyoyo <1279416582@qq.com> Date: Fri, 30 Jan 2026 23:35:17 +0800 Subject: [PATCH 3/6] Align marlin scale dtype with weights --- tests/model_executor/test_fused_moe_marlin_backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/model_executor/test_fused_moe_marlin_backend.py b/tests/model_executor/test_fused_moe_marlin_backend.py index 4dbb5609e5c..cb95211292d 100644 --- a/tests/model_executor/test_fused_moe_marlin_backend.py +++ b/tests/model_executor/test_fused_moe_marlin_backend.py @@ -55,7 +55,10 @@ def test_marlin_process_and_apply_paths(): method = marlin_backend.MarlinWeightOnlyMoEMethod() layer = _DummyLayer() + prev_dtype = paddle.get_default_dtype() + paddle.set_default_dtype("float16") method.create_weights(layer) + paddle.set_default_dtype(prev_dtype) up, down = _make_weights(layer) method.process_loaded_weights(layer, {"up": up, "down": down}) From bcfa2fd596035e2ba5ce08ce112a0f15b9cd1e48 Mon Sep 17 00:00:00 2001 From: xunyoyo <1279416582@qq.com> Date: Sat, 31 Jan 2026 10:41:13 +0800 Subject: [PATCH 4/6] Use 2 experts in marlin test --- .../test_fused_moe_marlin_backend.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/model_executor/test_fused_moe_marlin_backend.py b/tests/model_executor/test_fused_moe_marlin_backend.py index cb95211292d..f4cb85ad490 100644 --- a/tests/model_executor/test_fused_moe_marlin_backend.py +++ b/tests/model_executor/test_fused_moe_marlin_backend.py @@ -26,10 +26,10 @@ class _DummyLayer(paddle.nn.Layer): - def __init__(self, hidden_size=64, moe_intermediate_size=32, topk_method="topk"): + def __init__(self, hidden_size=64, moe_intermediate_size=32, topk_method="topk", num_local_experts=2): super().__init__() - self.num_local_experts = 1 - self.num_experts = 1 + self.num_local_experts = num_local_experts + self.num_experts = num_local_experts self.hidden_size = hidden_size self.moe_intermediate_size = moe_intermediate_size self.top_k = 1 @@ -46,8 +46,14 @@ def extract_moe_ffn_weights(self, state_dict): def _make_weights(layer): - up = [paddle.ones([layer.hidden_size, layer.moe_intermediate_size * 2], dtype="float16")] - down = [paddle.ones([layer.moe_intermediate_size, layer.hidden_size], dtype="float16")] + up = [ + paddle.ones([layer.hidden_size, layer.moe_intermediate_size * 2], dtype="float16") + for _ in range(layer.num_local_experts) + ] + down = [ + paddle.ones([layer.moe_intermediate_size, layer.hidden_size], dtype="float16") + for _ in range(layer.num_local_experts) + ] return up, down From 1f25d9b0dd3fdc46abd66796050d23304789e10a Mon Sep 17 00:00:00 2001 From: xunyoyo <1279416582@qq.com> Date: Sat, 31 Jan 2026 12:44:57 +0800 Subject: [PATCH 5/6] Stub marlin gemm op in test --- tests/model_executor/test_fused_moe_marlin_backend.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/model_executor/test_fused_moe_marlin_backend.py b/tests/model_executor/test_fused_moe_marlin_backend.py index f4cb85ad490..002202c3dc2 100644 --- a/tests/model_executor/test_fused_moe_marlin_backend.py +++ b/tests/model_executor/test_fused_moe_marlin_backend.py @@ -57,7 +57,7 @@ def _make_weights(layer): return up, down -def test_marlin_process_and_apply_paths(): +def test_marlin_process_and_apply_paths(monkeypatch): method = marlin_backend.MarlinWeightOnlyMoEMethod() layer = _DummyLayer() @@ -74,6 +74,13 @@ def test_marlin_process_and_apply_paths(): gate = paddle.nn.Linear(layer.hidden_size, layer.num_experts, bias_attr=False) x = paddle.ones([2, layer.hidden_size], dtype="float16") + monkeypatch.setattr( + marlin_backend, + "MoeWna16MarlinGemmApi", + lambda *_args, **kwargs: ( + paddle.zeros([kwargs["size_m"], kwargs["size_n"]], dtype=x.dtype), + ), + ) out = method.apply(layer, x, gate, topk_ids_hookfunc=lambda **_k: None) assert out.shape == [2, layer.hidden_size] From 3f8dd2b48f4f4b8e9b15dc94e66581201f1a1474 Mon Sep 17 00:00:00 2001 From: xunyoyo <1279416582@qq.com> Date: Sat, 31 Jan 2026 14:53:33 +0800 Subject: [PATCH 6/6] Format marlin backend test --- tests/model_executor/test_fused_moe_marlin_backend.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/model_executor/test_fused_moe_marlin_backend.py b/tests/model_executor/test_fused_moe_marlin_backend.py index 002202c3dc2..97aef099b01 100644 --- a/tests/model_executor/test_fused_moe_marlin_backend.py +++ b/tests/model_executor/test_fused_moe_marlin_backend.py @@ -77,9 +77,7 @@ def test_marlin_process_and_apply_paths(monkeypatch): monkeypatch.setattr( marlin_backend, "MoeWna16MarlinGemmApi", - lambda *_args, **kwargs: ( - paddle.zeros([kwargs["size_m"], kwargs["size_n"]], dtype=x.dtype), - ), + lambda *_args, **kwargs: (paddle.zeros([kwargs["size_m"], kwargs["size_n"]], dtype=x.dtype),), ) out = method.apply(layer, x, gate, topk_ids_hookfunc=lambda **_k: None) assert out.shape == [2, layer.hidden_size]