[Models] Add support for Kimi-K2.5 BF16 inference in FastDeploy #6922
Open
Linboyan-trc wants to merge 3 commits into PaddlePaddle:develop from
Open
[Models] Add support for Kimi-K2.5 BF16 inference in FastDeploy #6922 — Linboyan-trc wants to merge 3 commits into PaddlePaddle:develop from
Linboyan-trc wants to merge 3 commits into PaddlePaddle:develop from
Conversation
|
Thanks for your contribution! |
Contributor
There was a problem hiding this comment.
Pull request overview
该 PR 旨在为 FastDeploy 新增 Kimi-K2.5 的 BF16 推理支持,通过引入新的模型实现并复用 DeepseekV3 的推理主体能力来完成权重加载与注册。
Changes:
- 新增
KimiK25ForConditionalGeneration模型类并注册到ModelRegistry。 - 为 Kimi-K2.5 checkpoint 适配权重名映射与 MoE expert 权重加载流程。
- 新增对应的
PretrainedModel(用于 arch 注册/加载对接)。
Comment on lines
+1
to
+6
| """ | ||
| # Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at |
Comment on lines
+31
to
+37
| @ModelRegistry.register_model_class( | ||
| architecture="KimiK25ForConditionalGeneration", | ||
| module_name="kimi_k25", | ||
| category=ModelCategory.TEXT_GENERATION, | ||
| primary_use=ModelCategory.TEXT_GENERATION, | ||
| ) | ||
| class KimiK25ForConditionalGeneration(DeepseekV3ForCausalLM): |
Comment on lines
+51
to
+120
| from fastdeploy.model_executor.utils import ( | ||
| default_weight_loader, | ||
| process_weights_after_loading, | ||
| ) | ||
|
|
||
| stacked_params_mapping = [ | ||
| # (param_name, shard_name, shard_id) | ||
| ("up_gate_proj", "gate_proj", "gate"), | ||
| ("up_gate_proj", "up_proj", "up"), | ||
| ("embed_tokens.embeddings", "embed_tokens", None), | ||
| ("lm_head.linear", "language_model.lm_head", None), | ||
| ("experts.gate_correction_bias", "gate.e_score_correction_bias", None), | ||
| ("qkv_a_proj_with_mqa", "q_a_proj", "q_a"), | ||
| ("qkv_a_proj_with_mqa", "kv_a_proj_with_mqa", "kv_a"), | ||
| ] | ||
| # (param_name, weight_name, expert_id, shard_id) | ||
| expert_params_mapping = FusedMoE.make_expert_params_mapping( | ||
| num_experts=self.fd_config.model_config.n_routed_experts, | ||
| ckpt_gate_proj_name="gate_proj", | ||
| ckpt_down_proj_name="down_proj", | ||
| ckpt_up_proj_name="up_proj", | ||
| param_gate_up_proj_name="experts.up_gate_proj_", | ||
| param_down_proj_name="experts.down_proj_", | ||
| ) | ||
| params_dict = dict(self.named_parameters()) | ||
| process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config) | ||
| for loaded_weight_name, loaded_weight in weights_iterator: | ||
| logger.debug(f"Loading weight: {loaded_weight_name}") | ||
| loaded_weight_name = loaded_weight_name.replace("language_model.model", "model") | ||
| for param_name, weight_name, shard_id in stacked_params_mapping: | ||
| if weight_name not in loaded_weight_name: | ||
| continue | ||
| if "mlp.experts." in loaded_weight_name: | ||
| continue | ||
| model_param_name = loaded_weight_name.replace(weight_name, param_name) | ||
|
|
||
| if model_param_name not in params_dict: | ||
| continue | ||
|
|
||
| param = params_dict[model_param_name] | ||
| weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) | ||
| weight_loader(param, loaded_weight, shard_id) | ||
| break | ||
| else: | ||
| for mapping in expert_params_mapping: | ||
| param_name, weight_name, expert_id, shard_id = mapping | ||
| if weight_name not in loaded_weight_name: | ||
| continue | ||
| model_param_name = loaded_weight_name.replace(weight_name, param_name) | ||
| if model_param_name not in params_dict: | ||
| continue | ||
| param = params_dict[model_param_name] | ||
| weight_loader = param.weight_loader | ||
| weight_loader(param, loaded_weight, shard_id=shard_id, expert_id=expert_id) | ||
| break | ||
| else: | ||
| model_param_name = loaded_weight_name | ||
| if model_param_name not in params_dict: | ||
| continue | ||
| param = params_dict[model_param_name] | ||
| weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) | ||
| weight_loader(param, loaded_weight) | ||
|
|
||
| model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) | ||
| if "kv_b_proj" in model_sublayer_name: | ||
| kv_model_sublayer_name = model_sublayer_name.replace("kv_b_proj", "kv_b_proj_bmm") | ||
| process_weights_after_loading_fn(kv_model_sublayer_name) | ||
| process_weights_after_loading_fn(model_sublayer_name, param) | ||
|
|
||
|
|
Comment on lines
+25
to
+56
| from fastdeploy.config import FDConfig | ||
| from fastdeploy.model_executor.models.model_base import (ModelCategory, ModelRegistry) | ||
| from fastdeploy.model_executor.layers.moe.moe import FusedMoE | ||
| from fastdeploy.model_executor.models.deepseek_v3 import DeepseekV3ForCausalLM | ||
|
|
||
|
|
||
| @ModelRegistry.register_model_class( | ||
| architecture="KimiK25ForConditionalGeneration", | ||
| module_name="kimi_k25", | ||
| category=ModelCategory.TEXT_GENERATION, | ||
| primary_use=ModelCategory.TEXT_GENERATION, | ||
| ) | ||
| class KimiK25ForConditionalGeneration(DeepseekV3ForCausalLM): | ||
| """ | ||
| KimiK25ForConditionalGeneration | ||
| """ | ||
|
|
||
| def __init__(self, fd_config: FDConfig): | ||
| super().__init__(fd_config) | ||
|
|
||
| @classmethod | ||
| def name(cls): | ||
| return "KimiK25ForConditionalGeneration" | ||
|
|
||
| @paddle.no_grad() | ||
| def load_weights(self, weights_iterator) -> None: | ||
| from fastdeploy.model_executor.utils import ( | ||
| default_weight_loader, | ||
| process_weights_after_loading, | ||
| ) | ||
|
|
||
| stacked_params_mapping = [ |
Comment on lines
+1
to
+2
| """ | ||
| # Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Motivation
Support Kimi-K2.5 BF16 inference.
Modifications
Add model, weight loading, and tokenizer adaptation for Kimi-K2.5.
Accuracy Tests
Inference runs successfully with normal outputs.