[Models] Add support for Kimi-K2.5 BF16 inference in FastDeploy #6922
Open
Linboyan-trc wants to merge 3 commits into PaddlePaddle:develop from
Open
[Models] Add support for Kimi-K2.5 BF16 inference in FastDeploy #6922 — Linboyan-trc wants to merge 3 commits into PaddlePaddle:develop from
Linboyan-trc wants to merge 3 commits into PaddlePaddle:develop from
Conversation
|
Thanks for your contribution! |
Contributor
There was a problem hiding this comment.
Pull request overview
该 PR 旨在为 FastDeploy 新增 Kimi-K2.5 的 BF16 推理支持,通过引入新的模型实现并复用 DeepseekV3 的推理主体能力来完成权重加载与注册。
Changes:
- 新增
KimiK25ForConditionalGeneration模型类并注册到ModelRegistry。 - 为 Kimi-K2.5 checkpoint 适配权重名映射与 MoE expert 权重加载流程。
- 新增对应的
PretrainedModel(用于 arch 注册/加载对接)。
Comment on lines
+1
to
+6
| """ | ||
| # Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at |
Comment on lines
+31
to
+37
| @ModelRegistry.register_model_class( | ||
| architecture="KimiK25ForConditionalGeneration", | ||
| module_name="kimi_k25", | ||
| category=ModelCategory.TEXT_GENERATION, | ||
| primary_use=ModelCategory.TEXT_GENERATION, | ||
| ) | ||
| class KimiK25ForConditionalGeneration(DeepseekV3ForCausalLM): |
Comment on lines
+51
to
+120
| from fastdeploy.model_executor.utils import ( | ||
| default_weight_loader, | ||
| process_weights_after_loading, | ||
| ) | ||
|
|
||
| stacked_params_mapping = [ | ||
| # (param_name, shard_name, shard_id) | ||
| ("up_gate_proj", "gate_proj", "gate"), | ||
| ("up_gate_proj", "up_proj", "up"), | ||
| ("embed_tokens.embeddings", "embed_tokens", None), | ||
| ("lm_head.linear", "language_model.lm_head", None), | ||
| ("experts.gate_correction_bias", "gate.e_score_correction_bias", None), | ||
| ("qkv_a_proj_with_mqa", "q_a_proj", "q_a"), | ||
| ("qkv_a_proj_with_mqa", "kv_a_proj_with_mqa", "kv_a"), | ||
| ] | ||
| # (param_name, weight_name, expert_id, shard_id) | ||
| expert_params_mapping = FusedMoE.make_expert_params_mapping( | ||
| num_experts=self.fd_config.model_config.n_routed_experts, | ||
| ckpt_gate_proj_name="gate_proj", | ||
| ckpt_down_proj_name="down_proj", | ||
| ckpt_up_proj_name="up_proj", | ||
| param_gate_up_proj_name="experts.up_gate_proj_", | ||
| param_down_proj_name="experts.down_proj_", | ||
| ) | ||
| params_dict = dict(self.named_parameters()) | ||
| process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config) | ||
| for loaded_weight_name, loaded_weight in weights_iterator: | ||
| logger.debug(f"Loading weight: {loaded_weight_name}") | ||
| loaded_weight_name = loaded_weight_name.replace("language_model.model", "model") | ||
| for param_name, weight_name, shard_id in stacked_params_mapping: | ||
| if weight_name not in loaded_weight_name: | ||
| continue | ||
| if "mlp.experts." in loaded_weight_name: | ||
| continue | ||
| model_param_name = loaded_weight_name.replace(weight_name, param_name) | ||
|
|
||
| if model_param_name not in params_dict: | ||
| continue | ||
|
|
||
| param = params_dict[model_param_name] | ||
| weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) | ||
| weight_loader(param, loaded_weight, shard_id) | ||
| break | ||
| else: | ||
| for mapping in expert_params_mapping: | ||
| param_name, weight_name, expert_id, shard_id = mapping | ||
| if weight_name not in loaded_weight_name: | ||
| continue | ||
| model_param_name = loaded_weight_name.replace(weight_name, param_name) | ||
| if model_param_name not in params_dict: | ||
| continue | ||
| param = params_dict[model_param_name] | ||
| weight_loader = param.weight_loader | ||
| weight_loader(param, loaded_weight, shard_id=shard_id, expert_id=expert_id) | ||
| break | ||
| else: | ||
| model_param_name = loaded_weight_name | ||
| if model_param_name not in params_dict: | ||
| continue | ||
| param = params_dict[model_param_name] | ||
| weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) | ||
| weight_loader(param, loaded_weight) | ||
|
|
||
| model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) | ||
| if "kv_b_proj" in model_sublayer_name: | ||
| kv_model_sublayer_name = model_sublayer_name.replace("kv_b_proj", "kv_b_proj_bmm") | ||
| process_weights_after_loading_fn(kv_model_sublayer_name) | ||
| process_weights_after_loading_fn(model_sublayer_name, param) | ||
|
|
||
|
|
Comment on lines
+25
to
+56
| from fastdeploy.config import FDConfig | ||
| from fastdeploy.model_executor.models.model_base import (ModelCategory, ModelRegistry) | ||
| from fastdeploy.model_executor.layers.moe.moe import FusedMoE | ||
| from fastdeploy.model_executor.models.deepseek_v3 import DeepseekV3ForCausalLM | ||
|
|
||
|
|
||
| @ModelRegistry.register_model_class( | ||
| architecture="KimiK25ForConditionalGeneration", | ||
| module_name="kimi_k25", | ||
| category=ModelCategory.TEXT_GENERATION, | ||
| primary_use=ModelCategory.TEXT_GENERATION, | ||
| ) | ||
| class KimiK25ForConditionalGeneration(DeepseekV3ForCausalLM): | ||
| """ | ||
| KimiK25ForConditionalGeneration | ||
| """ | ||
|
|
||
| def __init__(self, fd_config: FDConfig): | ||
| super().__init__(fd_config) | ||
|
|
||
| @classmethod | ||
| def name(cls): | ||
| return "KimiK25ForConditionalGeneration" | ||
|
|
||
| @paddle.no_grad() | ||
| def load_weights(self, weights_iterator) -> None: | ||
| from fastdeploy.model_executor.utils import ( | ||
| default_weight_loader, | ||
| process_weights_after_loading, | ||
| ) | ||
|
|
||
| stacked_params_mapping = [ |
Comment on lines
+1
to
+2
| """ | ||
| # Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Motivation
Support Kimi-K2.5 BF16 inference.
Modifications
Add model, weight loading, and tokenizer adaptation for Kimi-K2.5.
Accuracy Tests
Inference runs successfully with normal outputs.