From c8b388845b80335e532f930ff09e8be41d050eaa Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 26 Mar 2026 08:17:31 +0000
Subject: [PATCH 01/51] qwen3_vl_moe support prefill_cudagraph

---
 .../layer_infer/transformer_layer_infer.py    | 39 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
index 391ee8bf6b..40d4bbc0ad 100644
--- a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
@@ -1,12 +1,14 @@
 import torch
 import torch.distributed as dist
 from typing import Tuple
+from lightllm.common.basemodel.infer_struct import InferStateInfo
 from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton_fused
 from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer
 from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
 from lightllm.models.qwen3_vl.infer_struct import Qwen3VLInferStateInfo
 from lightllm.distributed import all_reduce
 from lightllm.models.qwen3_vl.triton_kernel.deepstack_multimodal_emb import apply_deepstack_features
+from lightllm.utils.tensor_utils import tensor_to_no_ref_tensor
 
 
 class Qwen3VLMOETransformerLayerInfer(Qwen3MOETransformerLayerInfer):
@@ -48,7 +50,7 @@ def context_forward(self, input_embdings, infer_state: Qwen3VLInferStateInfo, la
         q, cache_kv = self._get_qkv(input1, infer_state, layer_weight)
         input1 = None
         self._post_cache_kv(cache_kv, infer_state, layer_weight)
-        o = self._context_attention_kernel(q, cache_kv, infer_state, layer_weight)
+        o = self._context_attention_wrapper_run(q, cache_kv, infer_state, layer_weight)
         q = None
         o = self._get_o(o, infer_state, layer_weight)
         if self.tp_world_size_ > 1:
@@ -62,9 +64,42 @@ def context_forward(self, input_embdings, infer_state: Qwen3VLInferStateInfo, la
         if self.tp_world_size_ > 1:
             all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
         input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
-        apply_deepstack_features(
+        self._apply_deepstack_features_wrapper_run(
             input_embeddings=input_embdings,
             infer_state=infer_state,
             layer_num=self.layer_num_,
         )
         return input_embdings
+
+    def _apply_deepstack_features_wrapper_run(
+        self,
+        input_embeddings: torch.Tensor,
+        infer_state: InferStateInfo,
+        layer_num: int,
+    ):
+        if torch.cuda.is_current_stream_capturing():
+            input_embeddings = input_embeddings.contiguous()
+            _input_embeddings = tensor_to_no_ref_tensor(input_embeddings)
+            pre_capture_graph = infer_state.prefill_cuda_graph_get_current_capture_graph()
+            pre_capture_graph.__exit__(None, None, None)
+
+            infer_state.prefill_cuda_graph_create_graph_obj()
+            infer_state.prefill_cuda_graph_get_current_capture_graph().__enter__()
+
+            def apply_func(new_infer_state: InferStateInfo):
+                apply_deepstack_features(
+                    input_embeddings=_input_embeddings,
+                    infer_state=new_infer_state,
+                    layer_num=layer_num,
+                )
+                return
+
+            infer_state.prefill_cuda_graph_add_cpu_runnning_func(func=apply_func, after_graph=pre_capture_graph)
+        else:
+            apply_deepstack_features(
+                input_embeddings=input_embeddings,
+                infer_state=infer_state,
+                layer_num=layer_num,
+            )
+
+        return

From e7fba3af30bb723dcf9909a6d06bbb9ff514134b Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 27 Mar 2026 05:17:57 +0000
Subject: [PATCH 02/51] add audio dp

---
 lightllm/server/api_cli.py                    |  4 ++
 lightllm/server/api_start.py                  | 21 +++++-
 lightllm/server/audioserver/manager.py        | 41 ++++++-----
 .../audioserver/model_infer/model_rpc.py      | 68 ++++++++++++++-----
 lightllm/server/core/objs/start_args_type.py  |  6 +-
 5 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index d32da8097c..776fbc8247 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -426,6 +426,9 @@ def make_argument_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch"
     )
+    parser.add_argument(
+        "--audio_infer_batch_size", type=int, default=None, help="number of audios to process in each inference batch"
+    )
     parser.add_argument(
         "--visual_send_batch_size",
         type=int,
@@ -440,6 +443,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT")
     parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT")
+    parser.add_argument("--audio_dp", type=int, default=1, help="number of data parallel instances for audio encoder")
     parser.add_argument(
         "--visual_nccl_ports",
         nargs="+",
diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py
index 364f9ca281..180b16e658 100644
--- a/lightllm/server/api_start.py
+++ b/lightllm/server/api_start.py
@@ -188,6 +188,9 @@ def normal_or_p_d_start(args):
     if args.visual_dp <= 0:
         raise ValueError("visual_dp must be a positive integer.")
 
+    if args.audio_dp <= 0:
+        raise ValueError("audio_dp must be a positive integer.")
+
     if args.visual_infer_batch_size is None:
         args.visual_infer_batch_size = args.visual_dp
 
@@ -198,6 +201,15 @@ def normal_or_p_d_start(args):
             f"a positive integer multiple of visual_dp ({args.visual_dp})"
         )
 
+    if args.audio_infer_batch_size is None:
+        args.audio_infer_batch_size = args.audio_dp * 2
+
+    if args.audio_infer_batch_size // args.audio_dp < 1 or args.audio_infer_batch_size % args.audio_dp != 0:
+        raise ValueError(
+            f"audio_infer_batch_size ({args.audio_infer_batch_size}) must be "
+            f"a positive integer multiple of audio_dp ({args.audio_dp})"
+        )
+
     if args.disable_chunked_prefill:
         args.chunked_prefill_size = args.max_req_total_len
         # 普通模式下
@@ -247,8 +259,10 @@ def normal_or_p_d_start(args):
     ports_locker.lock_port()
 
     node_world_size = args.tp // args.nnodes
+    audio_model_dp_ports_num = 0 if args.disable_audio else args.audio_dp
     can_use_ports = alloc_can_use_network_port(
-        num=10 + node_world_size + args.visual_dp * (args.visual_tp + 1), used_ports=already_uesd_ports
+        num=10 + node_world_size + args.visual_dp * (args.visual_tp + 1) + audio_model_dp_ports_num,
+        used_ports=already_uesd_ports,
     )
     logger.info(f"alloced ports: {can_use_ports}")
     (
@@ -274,6 +288,9 @@ def normal_or_p_d_start(args):
         visual_nccl_ports.append(can_use_ports[0])
         can_use_ports = can_use_ports[1:]
 
+    audio_model_dp_ports = can_use_ports[0:audio_model_dp_ports_num]
+    can_use_ports = can_use_ports[audio_model_dp_ports_num:]
+
     # 将申请好的端口放入args参数中
     if args.nccl_port is None:
         args.nccl_port = nccl_port
@@ -342,7 +359,7 @@ def normal_or_p_d_start(args):
                 start_audio_process,
             ],
             start_args=[
-                (args,),
+                (args, audio_model_dp_ports),
             ],
         )
 
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index bb0a745302..f7cb300aaf 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -26,7 +26,7 @@ class AudioManager:
     def __init__(
         self,
         args: StartArgs,
-        infer_batch_size=4,
+        audio_model_rpc_ports,
     ):
         context = zmq.asyncio.Context(2)
 
@@ -45,29 +45,32 @@ def __init__(
         self.waiting_reqs: List[GroupReqIndexes] = []
         self.model_weightdir = args.model_dir
         self.tp_world_size = args.tp
-        self.world_size = 1
-        self.infer_batch_size = infer_batch_size
+        self.audio_dp = args.audio_dp
+        self.infer_batch_size = args.audio_infer_batch_size
         self.trust_remote_code = args.trust_remote_code
         self.args = args
+        self.audio_model_rpc_ports = audio_model_rpc_ports or [None] * self.audio_dp
         self.shm_req_manager = ShmReqManager()
+        self.model_rpcs: List[AudioModelRpcClient] = []
 
     async def wait_to_model_ready(self):
-
-        self.model_rpcs: List[AudioModelRpcClient] = []
-        for rank_id in range(self.world_size):
-            rpc_model = await start_model_process(world_size=self.world_size)
+        self.model_rpcs = []
+        for dp_rank_id in range(self.audio_dp):
+            rpc_model = await start_model_process(
+                world_size=self.audio_dp, port=self.audio_model_rpc_ports[dp_rank_id], device_id=dp_rank_id
+            )
             self.model_rpcs.append(rpc_model)
 
         init_model_ret = []
-        for rank_id in range(self.world_size):
+        for dp_rank_id in range(self.audio_dp):
             kvargs = {
                 "weight_dir": self.model_weightdir,
                 "trust_remote_code": self.trust_remote_code,
-                "rank_id": rank_id,
+                "dp_rank_id": dp_rank_id,
                 "cache_port": self.cache_port,
                 "data_type": self.args.data_type,
             }
-            init_model_ret.append(self.model_rpcs[rank_id].init_model(kvargs))
+            init_model_ret.append(self.model_rpcs[dp_rank_id].init_model(kvargs))
         await asyncio.gather(*init_model_ret)
         return
 
@@ -75,7 +78,11 @@ async def infer_audios(self, audios: List[AudioItem]):
         if len(audios) == 0:
             return
 
-        rets = [self.model_rpcs[tp_rank].encode(audios) for tp_rank in range(self.world_size)]
+        rets = []
+        for dp_rank_id in range(self.audio_dp):
+            assigned_audios = [audios[i] for i in range(dp_rank_id, len(audios), self.audio_dp)]
+            if assigned_audios:
+                rets.append(self.model_rpcs[dp_rank_id].encode(assigned_audios))
         await asyncio.gather(*rets)
 
         return
@@ -148,19 +155,21 @@ async def loop_for_netio_req(self):
 
     def clean_up(self):
         for model_rpc in self.model_rpcs:
-            model_rpc.rpc_server_process.kill()
+            if model_rpc.rpc_server_process is not None:
+                model_rpc.rpc_server_process.kill()
         for model_rpc in self.model_rpcs:
-            model_rpc.rpc_server_process.join()
+            if model_rpc.rpc_server_process is not None:
+                model_rpc.rpc_server_process.join()
         return
 
 
-def start_audio_process(args, pipe_writer):
+def start_audio_process(args, model_rpc_ports, pipe_writer):
     # 注册graceful 退出的处理
     graceful_registry(inspect.currentframe().f_code.co_name)
     setproctitle.setproctitle(f"lightllm::{get_unique_server_name()}::audio_server")
 
+    audioserver = AudioManager(args=args, audio_model_rpc_ports=model_rpc_ports)
     try:
-        audioserver = AudioManager(args=args)
         asyncio.run(audioserver.wait_to_model_ready())
     except Exception as e:
         logger.exception(str(e))
@@ -170,7 +179,7 @@ def start_audio_process(args, pipe_writer):
     pipe_writer.send("init ok")
 
     def handle_exception(loop, context):
-        logger.exception(f"VisualServer Caught exception: {str(context)}")
+        logger.exception(f"AudioServer Caught exception: {str(context)}")
 
     loop = asyncio.new_event_loop()
     loop.set_exception_handler(handle_exception)
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index a8a2c39c3e..cbd39666a0 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -1,22 +1,25 @@
 import asyncio
 import rpyc
+import socket
 import torch
-from typing import Dict, List, Tuple
+import inspect
+from typing import List
+from rpyc.utils.classic import obtain
+from rpyc.utils.server import ThreadedServer
 from transformers.configuration_utils import PretrainedConfig
 from lightllm.models.whisper.whisper_audio import WhisperAudioModel
 from lightllm.models.qwen3_omni_moe_thinker.qwen3_omni_audio import Qwen3OmniMoeAudioEncoder
 from lightllm.server.multimodal_params import AudioItem
 from lightllm.utils.infer_utils import set_random_seed
 from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
+from lightllm.utils.graceful_utils import graceful_registry
 
 
 class AudioModelRpcServer(rpyc.Service):
     def exposed_init_model(self, kvargs):
-        # 注册graceful 退出的处理
-        from lightllm.utils.graceful_utils import graceful_registry
-        import inspect
-
-        graceful_registry(inspect.currentframe().f_code.co_name)
+        kvargs = obtain(kvargs)
+        self.dp_rank_id = kvargs["dp_rank_id"]
+        torch.cuda.set_device(self.dp_rank_id)
 
         weight_dir = kvargs["weight_dir"]
         model_cfg, _ = PretrainedConfig.get_config_dict(weight_dir)
@@ -41,7 +44,7 @@ def exposed_init_model(self, kvargs):
             # CpuEmbedCacheClient 的初始化需要依赖这个设置的环境信息。
             from lightllm.utils.dist_utils import set_current_device_id
 
-            set_current_device_id(torch.cuda.current_device())
+            set_current_device_id(self.dp_rank_id)
 
             self.cpu_embed_cache_client = CpuEmbedCacheClient(
                 create_meta_data=False,
@@ -65,6 +68,8 @@ def forward(self, audios):
 
     # @calculate_time(show=False, min_cost_ms=300)
     def exposed_encode(self, audios):
+        torch.cuda.set_device(self.dp_rank_id)
+        audios = obtain(audios)
         return self.forward(audios)
 
 
@@ -74,6 +79,7 @@ def __init__(self, model_rpc, world_size, rpc_server_process=None):
         self.world_size = world_size
         self.rpc_server_process = rpc_server_process
         self.use_rpc = self.world_size != 1
+
         if self.use_rpc:
 
             def async_wrap(f):
@@ -82,7 +88,6 @@ def async_wrap(f):
                 async def _func(*args, **kwargs):
                     ans = f(*args, **kwargs)
                     await asyncio.to_thread(ans.wait)
-                    # raise if exception
                     return ans.value
 
                 return _func
@@ -95,21 +100,52 @@ async def _func(*args, **kwargs):
         return
 
     async def init_model(self, kvargs):
-        ans: rpyc.AsyncResult = self._init_model(kvargs)
+        ans = self._init_model(kvargs)
         if self.use_rpc:
-            await ans
-            return
-        else:
-            return
+            return await ans
+        return ans
 
     async def encode(self, audios: List[AudioItem]):
         ans = self._encode(audios)
         if self.use_rpc:
             return await ans
-        else:
-            return ans
+        return ans
+
 
+def _init_env(port, device_id):
+    graceful_registry(inspect.currentframe().f_code.co_name)
+    torch.cuda.set_device(device_id)
 
-async def start_model_process(world_size):
+    from lightllm.utils.dist_utils import set_current_device_id
+    import lightllm.utils.rpyc_fix_utils as _
+
+    set_current_device_id(device_id)
+    t = ThreadedServer(AudioModelRpcServer(), port=port, protocol_config={"allow_pickle": True})
+    t.start()
+    return
+
+
+async def start_model_process(world_size, port=None, device_id=None):
     if world_size == 1:
         return AudioModelRpcClient(AudioModelRpcServer(), world_size)
+
+    import multiprocessing
+
+    proc = multiprocessing.Process(target=_init_env, args=(port, device_id))
+    proc.start()
+    await asyncio.sleep(2)
+    repeat_count = 0
+    while repeat_count < 20:
+        try:
+            con = rpyc.connect("localhost", port, config={"allow_pickle": True})
+            con._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
+            break
+        except BaseException:
+            await asyncio.sleep(1)
+        repeat_count += 1
+
+    if repeat_count == 20:
+        raise Exception("init rpc env error!")
+
+    assert proc.is_alive()
+    return AudioModelRpcClient(con.root, world_size, rpc_server_process=proc)
diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
index 37c022f3a3..8411a14e3c 100644
--- a/lightllm/server/core/objs/start_args_type.py
+++ b/lightllm/server/core/objs/start_args_type.py
@@ -99,10 +99,12 @@ class StartArgs:
     grouping_key: List[str] = field(default_factory=list)
     push_interval: int = field(default=10)
     visual_infer_batch_size: int = field(default=None)
+    audio_infer_batch_size: int = field(default=None)
     visual_send_batch_size: int = field(default=1)
     visual_gpu_ids: List[int] = field(default_factory=lambda: [0])
     visual_tp: int = field(default=1)
     visual_dp: int = field(default=1)
+    audio_dp: int = field(default=1)
     visual_nccl_ports: List[int] = field(default=None)
     enable_monitor_auth: bool = field(default=False)
     disable_cudagraph: bool = field(default=False)
@@ -125,7 +127,9 @@ class StartArgs:
     vit_att_backend: List[str] = field(
         default=("auto",), metadata={"choices": ["auto", "triton", "fa3", "sdpa", "xformers"]}
     )
-    llm_kv_type: str = field(default="None", metadata={"choices": ["None", "int8kv", "int4kv", "fp8kv_sph", "fp8kv_spt"]})
+    llm_kv_type: str = field(
+        default="None", metadata={"choices": ["None", "int8kv", "int4kv", "fp8kv_sph", "fp8kv_spt"]}
+    )
     llm_kv_quant_group_size: int = field(default=8)
     sampling_backend: str = field(default="triton", metadata={"choices": ["triton", "sglang_kernel"]})
     penalty_counter_mode: str = field(

From 671b5aa446b970c575c0f02ebb36d60f091e9ba8 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 27 Mar 2026 14:34:57 +0000
Subject: [PATCH 03/51] Add startup warmups for HTTP audio preload and per-rank
 audio workers to remove first-request audio cold-start latency.

---
 .../qwen3_omni_audio.py                       |  19 ++++
 lightllm/models/whisper/whisper_audio.py      |  18 ++++
 lightllm/server/api_http.py                   |   7 +-
 lightllm/server/audioserver/manager.py        | 101 +++++++++++++++++-
 .../audioserver/model_infer/model_rpc.py      |  38 ++++++-
 lightllm/server/httpserver/manager.py         |  83 ++++++++++++++
 lightllm/server/multimodal_params.py          |  35 ++++++
 7 files changed, 297 insertions(+), 4 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 3573ecde86..6c620448b9 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -389,3 +389,22 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
         if ids_to_set:
             self.cache_client.root.set_items_embed(ids=ids_to_set)
             torch.cuda.current_stream().synchronize()
+
+    @torch.no_grad()
+    def warmup(self, audio_bytes: bytes):
+        audio = BytesIO(audio_bytes)
+        audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
+        input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
+        if feature_attention_mask is not None:
+            audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+            input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
+        else:
+            audio_feature_lengths = None
+
+        feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+        _ = self.forward(
+            input_features,
+            feature_lens=feature_lens,
+        )
+        torch.cuda.current_stream().synchronize()
+        return
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 36c9408cb8..0493afdb9a 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -241,3 +241,21 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
         if ids_to_set:
             self.cache_client.root.set_items_embed(ids=ids_to_set)
             torch.cuda.current_stream().synchronize()
+
+    @torch.no_grad()
+    def warmup(self, audio_bytes: bytes):
+        audio = BytesIO(audio_bytes)
+        audio, _ = librosa.load(audio, sr=16000)
+
+        from .defaults import MIN_AUDIO_LEN
+
+        if audio.shape[0] < MIN_AUDIO_LEN:
+            audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0)
+
+        batch_audio_lens = np.array([min(audio.shape[0], self.max_length)], dtype=np.int32)
+        audios, audio_lens_after_cnn = self.audio_processor(
+            [audio], batch_audio_lens, sampling_rate=16000, return_tensors="pt"
+        )
+        _ = self.forward(audios, audio_lens_after_cnn)
+        torch.cuda.current_stream().synchronize()
+        return
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 230da5b369..6be738befc 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -40,7 +40,7 @@
 from fastapi.responses import Response, StreamingResponse, JSONResponse
 from lightllm.server.core.objs.sampling_params import SamplingParams
 from lightllm.server.core.objs import StartArgs
-from .multimodal_params import MultimodalParams
+from .multimodal_params import MultimodalParams, warmup_audio_preload
 from .httpserver.manager import HttpServerManager
 from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster
 from .api_lightllm import lightllm_get_score
@@ -359,6 +359,11 @@ async def startup_event():
     logger.info("server start up")
     loop = asyncio.get_event_loop()
     g_objs.set_args(get_env_start_args())
+    if g_objs.args.enable_multimodal and not g_objs.args.disable_audio:
+        warmup_start = time.time()
+        logger.info("http_audio_preload_warmup_start")
+        await warmup_audio_preload()
+        logger.info(f"http_audio_preload_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}")
     loop.create_task(g_objs.httpserver_manager.handle_loop())
     logger.info(f"server start up ok, loop use is {asyncio.get_event_loop()}")
     return
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index f7cb300aaf..b4fb002965 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -7,7 +7,8 @@
 import socket
 import inspect
 import setproctitle
-from typing import List
+import time
+from typing import Dict, List
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 from lightllm.utils.log_utils import init_logger
@@ -52,6 +53,27 @@ def __init__(
         self.audio_model_rpc_ports = audio_model_rpc_ports or [None] * self.audio_dp
         self.shm_req_manager = ShmReqManager()
         self.model_rpcs: List[AudioModelRpcClient] = []
+        self.req_stage_times: Dict[int, Dict[str, float]] = {}
+        self.next_module_port = args.multi_level_kv_cache_port if args.enable_cpu_cache else args.router_port
+
+    def _mark_req_stage(self, req_id: int, stage: str):
+        now = time.time()
+        req_stage_dict = self.req_stage_times.setdefault(req_id, {})
+        if "audio_recv" not in req_stage_dict:
+            req_stage_dict["audio_recv"] = now
+        req_stage_dict[stage] = now
+        return now - req_stage_dict["audio_recv"]
+
+    def _log_req_stage(self, req_id: int, stage: str, **kwargs):
+        elapsed_s = self._mark_req_stage(req_id, stage)
+        extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+        suffix = f" {extras}" if extras else ""
+        logger.info(f"lightllm_req_id:{req_id} stage:{stage} elapsed_ms:{elapsed_s * 1000.0:.3f}{suffix}")
+        return
+
+    def _cleanup_req_stage(self, req_id: int):
+        self.req_stage_times.pop(req_id, None)
+        return
 
     async def wait_to_model_ready(self):
         self.model_rpcs = []
@@ -72,18 +94,37 @@ async def wait_to_model_ready(self):
             }
             init_model_ret.append(self.model_rpcs[dp_rank_id].init_model(kvargs))
         await asyncio.gather(*init_model_ret)
+
+        warmup_start = time.time()
+        logger.info(f"audio_warmup_start audio_dp:{self.audio_dp}")
+
+        async def warmup_one_rank(dp_rank_id: int):
+            rank_start = time.time()
+            logger.info(f"audio_warmup_rank_start dp_rank_id:{dp_rank_id}")
+            await self.model_rpcs[dp_rank_id].warmup_model()
+            logger.info(
+                f"audio_warmup_rank_done dp_rank_id:{dp_rank_id} elapsed_ms:{(time.time() - rank_start) * 1000.0:.3f}"
+            )
+
+        await asyncio.gather(*[warmup_one_rank(dp_rank_id) for dp_rank_id in range(self.audio_dp)])
+        logger.info(f"audio_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}")
         return
 
     async def infer_audios(self, audios: List[AudioItem]):
         if len(audios) == 0:
             return
 
+        infer_start = time.time()
         rets = []
         for dp_rank_id in range(self.audio_dp):
             assigned_audios = [audios[i] for i in range(dp_rank_id, len(audios), self.audio_dp)]
             if assigned_audios:
                 rets.append(self.model_rpcs[dp_rank_id].encode(assigned_audios))
         await asyncio.gather(*rets)
+        logger.info(
+            f"audio_infer_batch_done audio_count:{len(audios)} audio_dp:{self.audio_dp} "
+            f"elapsed_ms:{(time.time() - infer_start) * 1000.0:.3f}"
+        )
 
         return
 
@@ -96,6 +137,11 @@ async def loop_for_fwd(self):
                 audios_need_infer = []
                 while len(self.waiting_reqs) > 0:
                     group_req_indexes = self.waiting_reqs.pop(0)
+                    self._log_req_stage(
+                        group_req_indexes.group_req_id,
+                        "audio_queue_picked",
+                        waiting_queue_size=len(self.waiting_reqs),
+                    )
                     shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0])
                     disable_prompt_cache = shm_req.sample_params.disable_prompt_cache
                     is_aborted = shm_req.is_aborted
@@ -105,6 +151,7 @@ async def loop_for_fwd(self):
                         # 因为采用 shm 来映射所有的 req 对象以后，引用管理情况复杂了
                         # 需要一些一致的流程来保证不出现异步问题。
                         self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+                        self._cleanup_req_stage(group_req_indexes.group_req_id)
                         continue
 
                     multimodal_params = group_req_indexes.multimodal_params
@@ -116,28 +163,74 @@ async def loop_for_fwd(self):
                     else:
                         ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
 
+                    current_req_has_pending_audio = False
                     for audio, ready in zip(multimodal_params.audios, ready_audio):
                         if not ready:
                             audios_need_infer.append(audio)
+                            current_req_has_pending_audio = True
 
                         if len(audios_need_infer) == self.infer_batch_size:
+                            batch_reqs = processing_group_reqs + (
+                                [group_req_indexes] if current_req_has_pending_audio else []
+                            )
+                            batch_req_ids = [req.group_req_id for req in batch_reqs]
+                            logger.info(
+                                f"audio_batch_ready req_ids:{batch_req_ids} "
+                                f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
+                            )
+                            for batch_req_id in batch_req_ids:
+                                self._log_req_stage(
+                                    batch_req_id, "audio_infer_start", batch_audio_count=len(audios_need_infer)
+                                )
                             await self.infer_audios(audios_need_infer)
+                            for batch_req_id in batch_req_ids:
+                                self._log_req_stage(
+                                    batch_req_id, "audio_infer_done", batch_audio_count=len(audios_need_infer)
+                                )
                             audios_need_infer = []
                             for _group_req_indexes in processing_group_reqs:
+                                self._log_req_stage(
+                                    _group_req_indexes.group_req_id,
+                                    "audio_send_to_next_module",
+                                    target_port=self.next_module_port,
+                                )
                                 self.send_to_next_module.send_pyobj(
                                     _group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL
                                 )
+                                self._cleanup_req_stage(_group_req_indexes.group_req_id)
                             processing_group_reqs = []
 
                     if len(audios_need_infer) == 0:
+                        self._log_req_stage(
+                            group_req_indexes.group_req_id,
+                            "audio_send_to_next_module",
+                            target_port=self.next_module_port,
+                            pending_audio_count=0,
+                        )
                         self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+                        self._cleanup_req_stage(group_req_indexes.group_req_id)
                     else:
                         processing_group_reqs.append(group_req_indexes)
 
                 if len(audios_need_infer) > 0:
+                    batch_req_ids = [req.group_req_id for req in processing_group_reqs]
+                    logger.info(
+                        f"audio_batch_ready req_ids:{batch_req_ids} "
+                        f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
+                    )
+                    for batch_req_id in batch_req_ids:
+                        self._log_req_stage(batch_req_id, "audio_infer_start", batch_audio_count=len(audios_need_infer))
                     await self.infer_audios(audios_need_infer)
+                    for batch_req_id in batch_req_ids:
+                        self._log_req_stage(batch_req_id, "audio_infer_done", batch_audio_count=len(audios_need_infer))
                     for _group_req_indexes in processing_group_reqs:
+                        self._log_req_stage(
+                            _group_req_indexes.group_req_id,
+                            "audio_send_to_next_module",
+                            target_port=self.next_module_port,
+                        )
                         self.send_to_next_module.send_pyobj(_group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+                        self._cleanup_req_stage(_group_req_indexes.group_req_id)
                     processing_group_reqs = []
                     audios_need_infer = []
 
@@ -149,6 +242,12 @@ async def loop_for_netio_req(self):
                     f"audio recv req id {recv_req.group_req_id} "
                     f"audio count {len(recv_req.multimodal_params.audios)}"
                 )
+                self._log_req_stage(
+                    recv_req.group_req_id,
+                    "audio_recv",
+                    audio_count=len(recv_req.multimodal_params.audios),
+                    waiting_queue_size=len(self.waiting_reqs),
+                )
                 self.waiting_reqs.append(recv_req)
             else:
                 assert False, f"Error Req Inf {recv_req}"
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index cbd39666a0..8db3be7f35 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -1,8 +1,12 @@
 import asyncio
-import rpyc
+import inspect
+import io
 import socket
+import wave
+
+import numpy as np
+import rpyc
 import torch
-import inspect
 from typing import List
 from rpyc.utils.classic import obtain
 from rpyc.utils.server import ThreadedServer
@@ -13,6 +17,21 @@
 from lightllm.utils.infer_utils import set_random_seed
 from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
 from lightllm.utils.graceful_utils import graceful_registry
+from lightllm.utils.log_utils import init_logger
+
+
+logger = init_logger(__name__)
+
+
+def _generate_silence_wav_bytes(sample_rate: int = 16000, num_samples: int = 16000) -> bytes:
+    samples = np.zeros(num_samples, dtype=np.int16)
+    buffer = io.BytesIO()
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)
+        wav_file.setframerate(sample_rate)
+        wav_file.writeframes(samples.tobytes())
+    return buffer.getvalue()
 
 
 class AudioModelRpcServer(rpyc.Service):
@@ -72,6 +91,13 @@ def exposed_encode(self, audios):
         audios = obtain(audios)
         return self.forward(audios)
 
+    def exposed_warmup_model(self):
+        torch.cuda.set_device(self.dp_rank_id)
+        warmup_audio = _generate_silence_wav_bytes()
+        self.model.warmup(warmup_audio)
+        logger.info(f"audio model warmup finished on dp_rank_id:{self.dp_rank_id}")
+        return
+
 
 class AudioModelRpcClient:
     def __init__(self, model_rpc, world_size, rpc_server_process=None):
@@ -94,9 +120,11 @@ async def _func(*args, **kwargs):
 
             self._init_model = async_wrap(self.model.init_model)
             self._encode = async_wrap(self.model.encode)
+            self._warmup_model = async_wrap(self.model.warmup_model)
         else:
             self._init_model = self.model.exposed_init_model
             self._encode = self.model.exposed_encode
+            self._warmup_model = self.model.exposed_warmup_model
         return
 
     async def init_model(self, kvargs):
@@ -111,6 +139,12 @@ async def encode(self, audios: List[AudioItem]):
             return await ans
         return ans
 
+    async def warmup_model(self):
+        ans = self._warmup_model()
+        if self.use_rpc:
+            return await ans
+        return ans
+
 
 def _init_env(port, device_id):
     graceful_registry(inspect.currentframe().f_code.co_name)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e28e4c93ad..3a818b0a39 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -124,6 +124,13 @@ def __init__(
         self.latest_success_infer_time_mark.set_value(int(time.time()))
         return
 
+    def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs):
+        cost_ms = (time.time() - start_time) * 1000.0
+        extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+        suffix = f" {extras}" if extras else ""
+        logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+        return
+
     async def _alloc_resource(self, items, md5sums, token_nums, datas):
 
         while True:
@@ -287,6 +294,10 @@ async def generate(
         start_time = time.time()
         request_headers = request.headers if request is not None else {}
         group_request_id = self.alloc_req_id(sampling_params, is_health_req)
+        if request is not None:
+            request.state.lightllm_req_id = group_request_id
+        audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
+        image_count = len(multimodal_params.images) if multimodal_params is not None else 0
 
         try:
             original_multimodal_params = None
@@ -295,11 +306,26 @@ async def generate(
 
             if self.pd_mode.is_P_or_NORMAL():
                 await multimodal_params.verify_and_preload(request)
+                self._log_stage_timing(
+                    group_request_id,
+                    start_time,
+                    "verify_and_preload_done",
+                    audio_count=audio_count,
+                    image_count=image_count,
+                )
 
             # 记录请求到达的相关信息
             await self._log_req_header(request_headers, group_request_id)
             # encode
             prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "encode_done",
+                prompt_tokens=len(prompt_ids),
+                audio_count=audio_count,
+                image_count=image_count,
+            )
 
             prompt_tokens = len(prompt_ids)
             # 监控
@@ -308,6 +334,13 @@ async def generate(
                 self.metric_client.histogram_observe("lightllm_request_input_length", prompt_tokens)
                 self.metric_client.histogram_observe("lightllm_request_max_new_tokens", sampling_params.max_new_tokens)
             prompt_ids = await self._check_and_repair_length(prompt_ids, sampling_params)
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "check_and_repair_length_done",
+                prompt_tokens=len(prompt_ids),
+                max_new_tokens=sampling_params.max_new_tokens,
+            )
 
             if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP():
                 # 在 nixl pd 模式下的 p 节点， 为了更好的兼容多模态的推理流程，np 节点需要先上报其 encode 好的 prompt ids 信息，然后
@@ -355,6 +388,12 @@ async def generate(
                     chunked_prefill_size=self.args.chunked_prefill_size,
                 )
                 req_objs.append(req_obj)
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "shm_req_init_done",
+                req_count=len(req_objs),
+            )
 
             logger.debug(
                 f"alloc shm_req for req_id {group_request_id}, "
@@ -368,6 +407,13 @@ async def generate(
             await self.transfer_to_next_module_or_node(
                 prompt, sampling_params, original_multimodal_params, req_status.group_req_objs
             )
+            self._log_stage_timing(
+                group_request_id,
+                start_time,
+                "request_forwarded",
+                has_audio=audio_count > 0,
+                has_image=image_count > 0,
+            )
 
             results_generator = self._wait_to_token_package(
                 start_time,
@@ -445,7 +491,15 @@ async def _encode(
                 ), "too many multimodal items!"
                 if multimodal_params.audios:
                     assert not self.args.disable_audio, "audio multimodal not enabled"
+                encode_start_time = time.time()
                 await self._alloc_multimodal_resources(multimodal_params, sampling_params)
+                log_req_id = getattr(sampling_params, "group_request_id", None)
+                logger.info(
+                    f"lightllm_req_id:{log_req_id} "
+                    f"stage:alloc_multimodal_resources_done "
+                    f"elapsed_ms:{(time.time() - encode_start_time) * 1000.0:.3f} "
+                    f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
+                )
                 prompt_ids = self.tokenizer.encode(
                     prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
                 )
@@ -539,20 +593,39 @@ async def transfer_to_next_module(
 
         if self.pd_mode.is_P_or_NORMAL():
             if not self.args.disable_vision:
+                logger.info(
+                    f"lightllm_req_id:{group_req_objs.group_req_id} "
+                    f"stage:transfer_to_visual "
+                    f"target_port:{self.args.visual_port}"
+                )
                 self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
                 return
 
             if not self.args.disable_audio:
+                logger.info(
+                    f"lightllm_req_id:{group_req_objs.group_req_id} "
+                    f"stage:transfer_to_audio "
+                    f"target_port:{self.args.audio_port}"
+                )
                 self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
                 return
 
             if self.args.enable_cpu_cache:
+                logger.info(
+                    f"lightllm_req_id:{group_req_objs.group_req_id} "
+                    f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}"
+                )
                 self.send_to_multi_level_kv_cache.send_pyobj(
                     group_req_objs.to_group_req_index(),
                     protocol=pickle.HIGHEST_PROTOCOL,
                 )
                 return
 
+            logger.info(
+                f"lightllm_req_id:{group_req_objs.group_req_id} "
+                f"stage:transfer_to_router "
+                f"target_port:{self.args.router_port}"
+            )
             self.send_to_router.send_pyobj(
                 group_req_objs.to_group_req_index(),
                 protocol=pickle.HIGHEST_PROTOCOL,
@@ -561,6 +634,11 @@ async def transfer_to_next_module(
 
         if self.pd_mode.is_D():
             # 在 D 模式下，不需要传输真的多模态参数，因为其已经被 P 处理好了
+            logger.info(
+                f"lightllm_req_id:{group_req_objs.group_req_id} "
+                f"stage:transfer_to_router_from_decode "
+                f"target_port:{self.args.router_port}"
+            )
             self.send_to_router.send_pyobj(
                 group_req_objs.to_group_req_index(),
                 protocol=pickle.HIGHEST_PROTOCOL,
@@ -619,6 +697,11 @@ async def _wait_to_token_package(
                         first_token_cost_ms = (time.time() - start_time) * 1000
                         is_first_token = False
                         self.first_time_costs.add(first_token_cost_ms)
+                        logger.info(
+                            f"lightllm_req_id:{group_request_id} "
+                            f"stage:first_token_arrived elapsed_ms:{first_token_cost_ms:.3f} "
+                            f"sub_req_id:{sub_req_id} prompt_tokens:{prompt_tokens}"
+                        )
 
                     out_token_counter += 1
 
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 09a07455b3..cd9d652ab8 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,5 +1,7 @@
 """Multimodal parameters for text generation."""
 import os
+import wave
+import time
 import librosa
 import base64
 from typing import List
@@ -12,6 +14,17 @@
 logger = init_logger(__name__)
 
 
+def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
+    num_samples = max(1, int(sample_rate * duration_seconds))
+    with BytesIO() as buffer:
+        with wave.open(buffer, "wb") as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(b"\x00\x00" * num_samples)
+        return buffer.getvalue()
+
+
 class AudioItem:
     def __init__(self, **kwargs):
         self._type = kwargs["type"]
@@ -32,6 +45,9 @@ def __init__(self, **kwargs):
 
     async def preload(self, request: Request):
         try:
+            req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None)
+            preload_start = time.time()
+            source_ready_start = preload_start
             if self._type == "url":
                 timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
                 proxy = os.getenv("REQUEST_PROXY", None)
@@ -40,13 +56,22 @@ async def preload(self, request: Request):
                 audio_data = base64.b64decode(self._data)
             else:
                 raise ValueError(f"cannot read audio which type is {self._type}!")
+            source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0
 
             # check if valid audio bytes
+            decode_start = time.time()
             audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+            decode_cost_ms = (time.time() - decode_start) * 1000.0
             from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
 
             self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN)  # 如果音频过短，会被pad到480的长度
             self._preload_data = audio_data
+            logger.info(
+                f"lightllm_req_id:{req_id} stage:audio_preload_done "
+                f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
+                f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} "
+                f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length}"
+            )
             return
 
         except Exception as e:
@@ -184,3 +209,13 @@ def to_origin_dict(self):
         ret["images"] = [i.to_origin_dict() for i in self.images]
         ret["audios"] = [a.to_origin_dict() for a in self.audios]
         return ret
+
+
+async def warmup_audio_preload():
+    warmup_audio = AudioItem(
+        type="base64",
+        data=base64.b64encode(generate_silence_wav_bytes()).decode("utf-8"),
+    )
+    await warmup_audio.preload(None)
+    warmup_audio.read()
+    return

From a3872599dc98eecd98e28915e0d77f09d96e61ec Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 00:45:08 +0000
Subject: [PATCH 04/51] add http client cache

---
 lightllm/utils/multimodal_utils.py | 35 +++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py
index 14c8303273..6e3766f950 100644
--- a/lightllm/utils/multimodal_utils.py
+++ b/lightllm/utils/multimodal_utils.py
@@ -3,10 +3,14 @@
 import httpx
 from PIL import Image
 from io import BytesIO
+from urllib.parse import urlparse
+from typing import Dict, Optional
 from fastapi import Request
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
+_HTTP_CLIENTS: Dict[Optional[str], httpx.AsyncClient] = {}
+_LOCAL_RESOURCE_HOSTS = {"127.0.0.1", "localhost", "::1"}
 
 
 def image2base64(img_str: str):
@@ -21,20 +25,25 @@ def image2base64(img_str: str):
 async def fetch_resource(url, request: Request, timeout, proxy=None):
     logger.info(f"Begin to download resource from url: {url}")
     start_time = time.time()
-    async with httpx.AsyncClient(proxy=proxy) as client:
-        async with client.stream("GET", url, timeout=timeout) as response:
-            response.raise_for_status()
-            ans_bytes = []
-            async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
-                if request is not None and await request.is_disconnected():
-                    await response.aclose()
-                    raise Exception("Request disconnected. User cancelled download.")
-                ans_bytes.append(chunk)
-                # 接收的数据不能大于128M
-                if len(ans_bytes) > 128:
-                    raise Exception(f"url {url} recv data is too big")
+    hostname = urlparse(url).hostname
+    effective_proxy = None if hostname in _LOCAL_RESOURCE_HOSTS else proxy
+    client = _HTTP_CLIENTS.get(effective_proxy)
+    if client is None:
+        client = httpx.AsyncClient(proxy=effective_proxy)
+        _HTTP_CLIENTS[effective_proxy] = client
+    async with client.stream("GET", url, timeout=timeout) as response:
+        response.raise_for_status()
+        ans_bytes = []
+        async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
+            if request is not None and await request.is_disconnected():
+                await response.aclose()
+                raise Exception("Request disconnected. User cancelled download.")
+            ans_bytes.append(chunk)
+            # 接收的数据不能大于128M
+            if len(ans_bytes) > 128:
+                raise Exception(f"url {url} recv data is too big")
 
-            content = b"".join(ans_bytes)
+        content = b"".join(ans_bytes)
     end_time = time.time()
     cost_time = end_time - start_time
     logger.info(f"Download url {url} resource cost time: {cost_time} seconds")

From cd89cd613117c33a5900dc2fb2466ea2d5599797 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 00:48:15 +0000
Subject: [PATCH 05/51] reduce polling time

---
 lightllm/server/audioserver/manager.py | 7 ++++++-
 lightllm/server/router/manager.py      | 9 +++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index b4fb002965..ac4058b643 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -55,6 +55,7 @@ def __init__(
         self.model_rpcs: List[AudioModelRpcClient] = []
         self.req_stage_times: Dict[int, Dict[str, float]] = {}
         self.next_module_port = args.multi_level_kv_cache_port if args.enable_cpu_cache else args.router_port
+        self.waiting_reqs_event = asyncio.Event()
 
     def _mark_req_stage(self, req_id: int, stage: str):
         now = time.time()
@@ -131,7 +132,10 @@ async def infer_audios(self, audios: List[AudioItem]):
     async def loop_for_fwd(self):
         while True:
             if len(self.waiting_reqs) == 0:
-                await asyncio.sleep(0.01)  # 10ms
+                self.waiting_reqs_event.clear()
+                if len(self.waiting_reqs) == 0:
+                    await self.waiting_reqs_event.wait()
+                continue
             else:
                 processing_group_reqs = []
                 audios_need_infer = []
@@ -249,6 +253,7 @@ async def loop_for_netio_req(self):
                     waiting_queue_size=len(self.waiting_reqs),
                 )
                 self.waiting_reqs.append(recv_req)
+                self.waiting_reqs_event.set()
             else:
                 assert False, f"Error Req Inf {recv_req}"
 
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index 0d2705fab2..f5e0b8df9a 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -316,8 +316,7 @@ async def _add_batch(self, batch: Batch):
         # 添加新请求
         reqs = [r.to_router_rpc_obj() for r in batch.reqs]
         while not self.shm_reqs_io_buffer.is_empty():
-            await asyncio.sleep(0.02)
-
+            await asyncio.sleep(0.001)
         self.shm_reqs_io_buffer.write_obj(reqs)
         self.shm_reqs_io_buffer.set_ready()
         logger.debug(f"Prefill Batch: {batch.simple_log()} \n")
@@ -326,8 +325,7 @@ async def _add_batch(self, batch: Batch):
     async def _aborted_reqs(self, aborted_reqs: List[Req]):
         cmds = [AbortedReqCmd(req_id=r.request_id) for r in aborted_reqs]
         while not self.shm_reqs_io_buffer.is_empty():
-            await asyncio.sleep(0.02)
-
+            await asyncio.sleep(0.001)
         self.shm_reqs_io_buffer.write_obj(cmds)
         self.shm_reqs_io_buffer.set_ready()
         return
@@ -335,8 +333,7 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]):
     async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]):
         cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs]
         while not self.shm_reqs_io_buffer.is_empty():
-            await asyncio.sleep(0.02)
-
+            await asyncio.sleep(0.001)
         self.shm_reqs_io_buffer.write_obj(cmds)
         self.shm_reqs_io_buffer.set_ready()
         return

From 4788980006dc0be673e151d0c9c8f4cf12afcfdf Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 01:27:53 +0000
Subject: [PATCH 06/51] Optimize audio shm payload handling and cache lookups

---
 .../qwen3_omni_audio.py                       | 47 ++++++++++++-------
 lightllm/models/whisper/whisper_audio.py      | 20 ++------
 .../embed_cache/impl/naive_memory_cache.py    |  2 +
 lightllm/server/httpserver/manager.py         | 21 ++++-----
 lightllm/server/multimodal_params.py          | 37 ++++++++++++++-
 5 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 6c620448b9..424a768bbf 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -1,6 +1,7 @@
 import os
 import json
 import math
+import time
 import torch
 import rpyc
 import librosa
@@ -10,16 +11,18 @@
 from safetensors import safe_open
 from torch.nn import functional as F
 from typing import Callable, Optional, Union, List
-from rpyc.utils.classic import obtain
-
 from transformers.activations import ACT2FN
 
-from lightllm.server.multimodal_params import AudioItem
+from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
+from lightllm.utils.log_utils import init_logger
+
+
+logger = init_logger(__name__)
 
 
 def _get_feat_extract_output_lengths(input_lengths):
@@ -338,6 +341,11 @@ def forward(
         return hidden_states
 
     def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedCacheClient):
+        encode_start = time.time()
+        load_shm_cost = 0.0
+        preprocess_cost = 0.0
+        forward_cost = 0.0
+        cache_copy_cost = 0.0
         uuids = []
         items: List[AudioItem] = []
         per_audio_features: List[torch.Tensor] = []
@@ -345,12 +353,14 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
             if isinstance(item, AudioItem):
                 uuids.append(item.uuid)
                 items.append(item)
+                load_start = time.time()
                 audio_data = read_shm(get_shm_name_data(item.uuid))
-                audio = BytesIO(audio_data)
-                audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
+                audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
+                load_shm_cost += time.time() - load_start
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
+            preprocess_start = time.time()
             input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
             if feature_attention_mask is not None:
                 audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
@@ -361,22 +371,19 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
             feature_lens = (
                 audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
             )
+            preprocess_cost += time.time() - preprocess_start
 
+            forward_start = time.time()
             audio_features = self.forward(
                 input_features,
                 feature_lens=feature_lens,
             )
+            forward_cost += time.time() - forward_start
             per_audio_features.append(audio_features)
 
-        ready_audio = obtain(self.cache_client.root.get_items_embed(uuids))
-        ids_to_set = []
-        for i, ready in enumerate(ready_audio):
-            if ready:
-                continue
-
-            uid = uuids[i]
+        cache_copy_start = time.time()
+        for i, uid in enumerate(uuids):
             item = items[i]
-
             cur_embed = per_audio_features[i]
             cpu_embed_cache_client.copy_to_cache(
                 embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache
@@ -384,11 +391,19 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
             assert (
                 item.token_num == cur_embed.shape[0]
             ), f"audio token num not match {item.token_num} vs {cur_embed.shape[0]} "
-            ids_to_set.append(uid)
 
-        if ids_to_set:
-            self.cache_client.root.set_items_embed(ids=ids_to_set)
+        if uuids:
             torch.cuda.current_stream().synchronize()
+            self.cache_client.root.set_items_embed(ids=uuids)
+        cache_copy_cost += time.time() - cache_copy_start
+        logger.info(
+            f"audio_encode_batch_done audio_count:{len(audio_items)} "
+            f"load_shm_ms:{load_shm_cost * 1000.0:.3f} "
+            f"preprocess_ms:{preprocess_cost * 1000.0:.3f} "
+            f"forward_ms:{forward_cost * 1000.0:.3f} "
+            f"cache_ms:{cache_copy_cost * 1000.0:.3f} "
+            f"elapsed_ms:{(time.time() - encode_start) * 1000.0:.3f}"
+        )
 
     @torch.no_grad()
     def warmup(self, audio_bytes: bytes):
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 0493afdb9a..a94d22dd0c 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -10,8 +10,7 @@
 from safetensors.torch import load_file
 from transformers.processing_utils import ProcessorMixin
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
-from lightllm.server.multimodal_params import AudioItem
-from rpyc.utils.classic import obtain
+from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
 from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
 
 # tokenizer_class removed
@@ -175,8 +174,7 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
                 uuids.append(item.uuid)
                 items.append(item)
                 audio_data = read_shm(get_shm_name_data(item.uuid))
-                audio = BytesIO(audio_data)
-                audio, _ = librosa.load(audio, sr=16000)
+                audio = load_audio_from_shm_payload(audio_data, item.extra_params, 16000)
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
@@ -222,25 +220,17 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
                 continue
             per_audio_embeds[owner].append(audios[chunk_idx][:token_len])
 
-        ready_audio = obtain(self.cache_client.root.get_items_embed(uuids))
-        ids_to_set = []
-        for i, ready in enumerate(ready_audio):
-            if ready:
-                continue
-
-            uid = uuids[i]
+        for i, uid in enumerate(uuids):
             item = items[i]
-
             # 拼接该 audio 的所有 chunk embedding
             cur_embed = torch.cat(per_audio_embeds[i], dim=0)
             cpu_embed_cache_client.copy_to_cache(
                 embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache
             )
-            ids_to_set.append(uid)
 
-        if ids_to_set:
-            self.cache_client.root.set_items_embed(ids=ids_to_set)
+        if uuids:
             torch.cuda.current_stream().synchronize()
+            self.cache_client.root.set_items_embed(ids=uuids)
 
     @torch.no_grad()
     def warmup(self, audio_bytes: bytes):
diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py
index 5ad26fbcc8..ff7b2374b2 100644
--- a/lightllm/server/embed_cache/impl/naive_memory_cache.py
+++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py
@@ -205,6 +205,8 @@ def alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[l
                             "token_id": rec.token_id,
                             "start_index_in_embed_cache": rec.mem_block.start,
                             "token_num": rec.token_num,
+                            "data_ready": rec.data,
+                            "embed_ready": rec.embed,
                         }
                     )
 
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 3a818b0a39..8b3be9b0e8 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -144,23 +144,17 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
                 logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
                 raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
 
-            uid_list = []
-            for item, rec in zip(items, records):
+            update_data_ids = []
+            for item, rec, data in zip(items, records, datas):
                 item: Union[ImageItem, AudioItem] = item
                 item.uuid = rec["id"]
                 item.token_id = rec["token_id"]
                 item.token_num = rec["token_num"]
                 item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
 
-                uid_list.append(rec["id"])
-
-            ready_flags = obtain(self.cache_client.root.get_items_data(uid_list))
-            update_data_ids = []
-
-            for uid, ready, data in zip(uid_list, ready_flags, datas):
-                if not ready:
-                    create_shm(get_shm_name_data(uid), data)
-                    update_data_ids.append(uid)
+                if not rec["data_ready"]:
+                    create_shm(get_shm_name_data(rec["id"]), data)
+                    update_data_ids.append(rec["id"])
 
             if update_data_ids:
                 self.cache_client.root.set_items_data(update_data_ids)
@@ -188,7 +182,10 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
                     self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
                     data = audio.read()
                     token_num = self.tokenizer.get_audio_token_length(audio)
-                    md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
+                    payload_md5 = audio.extra_params.get("audio_payload_md5")
+                    if payload_md5 is None:
+                        payload_md5 = hashlib.md5(data).hexdigest()
+                    md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
                     md5sums.append(md5sum)
                     tokens_nums.append(token_num)
                     datas.append(data)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index cd9d652ab8..13a26d9b57 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -4,6 +4,8 @@
 import time
 import librosa
 import base64
+import hashlib
+import numpy as np
 from typing import List
 from io import BytesIO
 from PIL import Image
@@ -12,6 +14,9 @@
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
+RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
+WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
+AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
 
 
 def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
@@ -25,6 +30,22 @@ def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float
         return buffer.getvalue()
 
 
+def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
+    audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT)
+    if audio_shm_format == WAVEFORM_F32_SHM_FORMAT:
+        num_samples = int(extra_params.get("audio_num_samples", 0))
+        if num_samples > 0:
+            return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
+        return np.frombuffer(audio_data, dtype=np.float32)
+
+    audio, _ = librosa.load(BytesIO(audio_data), sr=sample_rate)
+    return np.asarray(audio, dtype=np.float32)
+
+
+def should_use_raw_audio_shm() -> bool:
+    return os.getenv(AUDIO_SHM_USE_RAW_ENV, "0") == "1"
+
+
 class AudioItem:
     def __init__(self, **kwargs):
         self._type = kwargs["type"]
@@ -61,16 +82,28 @@ async def preload(self, request: Request):
             # check if valid audio bytes
             decode_start = time.time()
             audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+            audio_values = np.asarray(audio_values, dtype=np.float32)
             decode_cost_ms = (time.time() - decode_start) * 1000.0
             from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
 
             self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN)  # 如果音频过短，会被pad到480的长度
-            self._preload_data = audio_data
+            if should_use_raw_audio_shm():
+                self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
+                self.extra_params.pop("audio_sample_rate", None)
+                self.extra_params.pop("audio_num_samples", None)
+                self._preload_data = audio_data
+            else:
+                self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
+                self.extra_params["audio_sample_rate"] = 16000
+                self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
+                self._preload_data = audio_values.tobytes()
+            self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
             logger.info(
                 f"lightllm_req_id:{req_id} stage:audio_preload_done "
                 f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
                 f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} "
-                f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length}"
+                f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length} "
+                f"shm_format:{self.extra_params['audio_shm_format']}"
             )
             return
 

From 7b05403af6df9f42d294c5b28ee76fd7c4b89342 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 01:31:21 +0000
Subject: [PATCH 07/51] cache hann_window/mel_filters

---
 .../qwen3_omni_moe_thinker/audio_process.py    | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 833cc8f4b0..e9dc931886 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -46,10 +46,25 @@ def __init__(
             norm="slaney",
             mel_scale="slaney",
         )
+        self._hann_window_cache = {}
+        self._mel_filters_cache = {}
+
+    def _get_cached_feature_tensors(self, device: Union[str, torch.device]):
+        device_key = str(device)
+        window = self._hann_window_cache.get(device_key)
+        if window is None:
+            window = torch.hann_window(self.n_fft, device=device)
+            self._hann_window_cache[device_key] = window
+
+        mel_filters = self._mel_filters_cache.get(device_key)
+        if mel_filters is None:
+            mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
+            self._mel_filters_cache[device_key] = mel_filters
+        return window, mel_filters
 
     def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
         waveform = torch.from_numpy(waveform).to(device, torch.float32)
-        window = torch.hann_window(self.n_fft, device=device)
+        window, mel_filters = self._get_cached_feature_tensors(device)
 
         if self.dither != 0.0:
             waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
@@ -57,7 +72,6 @@ def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu
         stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
         magnitudes = stft[..., :-1].abs() ** 2
 
-        mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
         mel_spec = mel_filters.T @ magnitudes
 
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()

From 713c45d912aec4b6955aaf6e55be0ef8e5705dd6 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 08:26:52 +0000
Subject: [PATCH 08/51] Fix audio preload config to follow tokenizer settings

---
 .../common/basemodel/multimodal_tokenizer.py  |  1 +
 lightllm/models/internvl/model.py             |  5 +++++
 .../models/qwen3_omni_moe_thinker/model.py    |  5 +++++
 lightllm/server/api_http.py                   |  4 +++-
 lightllm/server/httpserver/manager.py         |  4 +++-
 lightllm/server/multimodal_params.py          | 20 +++++++++++--------
 6 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/lightllm/common/basemodel/multimodal_tokenizer.py b/lightllm/common/basemodel/multimodal_tokenizer.py
index cdcbd7f089..872a418bf7 100644
--- a/lightllm/common/basemodel/multimodal_tokenizer.py
+++ b/lightllm/common/basemodel/multimodal_tokenizer.py
@@ -33,6 +33,7 @@
 class BaseMultiModalTokenizer(ABC):
     def __init__(self, tokenizer, **kwargs):
         self.tokenizer = tokenizer
+        self.audio_preload_config = None
 
     def __getattr__(self, name):
         obj_dict = object.__getattribute__(self, "__dict__")
diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py
index ccb76d3512..70c797aeb8 100644
--- a/lightllm/models/internvl/model.py
+++ b/lightllm/models/internvl/model.py
@@ -50,6 +50,11 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
 
         self.audio_min_length = MIN_AUDIO_LEN
         self.audio_max_length = 16000 * 30
+        self.audio_preload_config = {
+            "sampling_rate": 16000,
+            "hop_length": 160,
+            "min_audio_len": int(self.audio_min_length),
+        }
 
     def init_imageitem_extral_params(
         self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index a1419f83ef..4a5131bbf1 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -42,6 +42,11 @@ def __init__(self, tokenizer=None, processor=None, **kwargs):
         self.sampling_rate = self.audio_processor.sampling_rate
         self.n_samples = self.audio_processor.n_samples
         self.hop_length = self.audio_processor.hop_length
+        self.audio_preload_config = {
+            "sampling_rate": int(self.sampling_rate),
+            "hop_length": int(self.hop_length),
+            "min_audio_len": int(MIN_AUDIO_LEN),
+        }
 
         self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"]
         self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"]
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 6be738befc..cb7619fbe5 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -272,7 +272,9 @@ async def tokens(request: Request):
 
         multimodal_params_dict = request_dict.get("multimodal_params", {})
         multimodal_params = MultimodalParams(**multimodal_params_dict)
-        await multimodal_params.verify_and_preload(request)
+        await multimodal_params.verify_and_preload(
+            request, audio_preload_config=getattr(g_objs.httpserver_manager.tokenizer, "audio_preload_config", None)
+        )
         return JSONResponse(
             {
                 "ntokens": g_objs.httpserver_manager.tokens(
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8b3be9b0e8..9a6864774a 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -302,7 +302,9 @@ async def generate(
                 original_multimodal_params = copy.deepcopy(multimodal_params)
 
             if self.pd_mode.is_P_or_NORMAL():
-                await multimodal_params.verify_and_preload(request)
+                await multimodal_params.verify_and_preload(
+                    request, audio_preload_config=getattr(self.tokenizer, "audio_preload_config", None)
+                )
                 self._log_stage_timing(
                     group_request_id,
                     start_time,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 13a26d9b57..440bff06c5 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -17,6 +17,8 @@
 RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
 WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
 AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
+DEFAULT_AUDIO_SAMPLE_RATE = 16000
+DEFAULT_MIN_AUDIO_LEN = 480
 
 
 def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
@@ -64,7 +66,7 @@ def __init__(self, **kwargs):
         self._preload_data = None
         self.extra_params = {}
 
-    async def preload(self, request: Request):
+    async def preload(self, request: Request, audio_preload_config: dict = None):
         try:
             req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None)
             preload_start = time.time()
@@ -79,14 +81,16 @@ async def preload(self, request: Request):
                 raise ValueError(f"cannot read audio which type is {self._type}!")
             source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0
 
+            audio_preload_config = audio_preload_config or {}
+            target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE))
+            min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN))
+
             # check if valid audio bytes
             decode_start = time.time()
-            audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+            audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate)
             audio_values = np.asarray(audio_values, dtype=np.float32)
             decode_cost_ms = (time.time() - decode_start) * 1000.0
-            from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
-
-            self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN)  # 如果音频过短，会被pad到480的长度
+            self.audio_length = max(audio_values.shape[0], min_audio_len)
             if should_use_raw_audio_shm():
                 self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
                 self.extra_params.pop("audio_sample_rate", None)
@@ -94,7 +98,7 @@ async def preload(self, request: Request):
                 self._preload_data = audio_data
             else:
                 self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
-                self.extra_params["audio_sample_rate"] = 16000
+                self.extra_params["audio_sample_rate"] = target_sample_rate
                 self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
                 self._preload_data = audio_values.tobytes()
             self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
@@ -221,11 +225,11 @@ def __init__(
         self.audios = [AudioItem(**a) for a in audios]
         return
 
-    async def verify_and_preload(self, request: Request):
+    async def verify_and_preload(self, request: Request, audio_preload_config: dict = None):
         for image in self.images:
             await image.preload(request)
         for audio in self.audios:
-            await audio.preload(request)
+            await audio.preload(request, audio_preload_config=audio_preload_config)
         return
 
     def to_dict(self):

From 65a3ec67bb94bd41b604e415f2e227ae35c81ef9 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 31 Mar 2026 09:27:15 +0000
Subject: [PATCH 09/51] Optimize qwen3 omni audio preprocessing fast path

---
 .../qwen3_omni_moe_thinker/audio_process.py   | 23 +++++++++++
 .../qwen3_omni_audio.py                       | 41 +++++++++++--------
 lightllm/server/multimodal_params.py          | 16 +++++++-
 3 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index e9dc931886..42eae8edb5 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -105,6 +105,29 @@ def zero_mean_unit_var_norm(
 
         return normed_input_values
 
+    def _preprocess_single_padded(
+        self,
+        raw_speech: np.ndarray,
+        num_frames: int,
+        device: Optional[str] = "cpu",
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        waveform = np.asarray(raw_speech, dtype=np.float32)
+        if waveform.ndim != 1:
+            raise ValueError(f"single audio fast path expects 1D waveform, got shape={waveform.shape}")
+
+        extracted = self._torch_extract_fbank_features(waveform[None, :], device)
+        extracted = np.asarray(extracted, dtype=np.float32)
+        if extracted.ndim != 3:
+            raise ValueError(f"unexpected extracted feature shape={extracted.shape}")
+
+        if extracted.shape[-1] < num_frames:
+            raise ValueError(f"feature frames {extracted.shape[-1]} < requested num_frames {num_frames}")
+
+        compact_features = torch.from_numpy(extracted[:, :, :num_frames]).to(device="cuda", dtype=torch.bfloat16)
+        compact_features = compact_features[0].contiguous()
+        feature_lens = torch.tensor([num_frames], device="cuda", dtype=torch.long)
+        return compact_features, feature_lens
+
     def _preprocess(
         self,
         raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 424a768bbf..f3cd0525eb 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -13,7 +13,7 @@
 from typing import Callable, Optional, Union, List
 from transformers.activations import ACT2FN
 
-from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
+from lightllm.server.multimodal_params import AudioItem, WAVEFORM_F32_SHM_FORMAT, load_audio_from_shm_payload
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
@@ -356,21 +356,27 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
                 load_start = time.time()
                 audio_data = read_shm(get_shm_name_data(item.uuid))
                 audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
+                audio_num_frames = item.extra_params.get("audio_num_frames")
                 load_shm_cost += time.time() - load_start
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
             preprocess_start = time.time()
-            input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
-            if feature_attention_mask is not None:
-                audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
-                input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
+            if audio_num_frames is not None and item.extra_params.get("audio_shm_format") == WAVEFORM_F32_SHM_FORMAT:
+                input_features, feature_lens = self.processor._preprocess_single_padded(
+                    audio, int(audio_num_frames), device="cpu"
+                )
             else:
-                audio_feature_lengths = None
-
-            feature_lens = (
-                audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
-            )
+                input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
+                if feature_attention_mask is not None:
+                    audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+                    input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
+                else:
+                    audio_feature_lengths = None
+
+                feature_lens = (
+                    audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+                )
             preprocess_cost += time.time() - preprocess_start
 
             forward_start = time.time()
@@ -409,14 +415,13 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
     def warmup(self, audio_bytes: bytes):
         audio = BytesIO(audio_bytes)
         audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
-        input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
-        if feature_attention_mask is not None:
-            audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
-            input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
-        else:
-            audio_feature_lengths = None
-
-        feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+        num_frames = (max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length
+        padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * (
+            self.processor.hop_length
+        )
+        if padded_len > audio.shape[0]:
+            audio = np.pad(audio, (0, padded_len - audio.shape[0]), mode="constant", constant_values=0.0)
+        input_features, feature_lens = self.processor._preprocess_single_padded(audio, num_frames, device="cpu")
         _ = self.forward(
             input_features,
             feature_lens=feature_lens,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 440bff06c5..da5d239c6a 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -18,6 +18,7 @@
 WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
 AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
 DEFAULT_AUDIO_SAMPLE_RATE = 16000
+DEFAULT_AUDIO_HOP_LENGTH = 160
 DEFAULT_MIN_AUDIO_LEN = 480
 
 
@@ -83,6 +84,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
 
             audio_preload_config = audio_preload_config or {}
             target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE))
+            hop_length = int(audio_preload_config.get("hop_length", DEFAULT_AUDIO_HOP_LENGTH))
             min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN))
 
             # check if valid audio bytes
@@ -90,16 +92,28 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
             audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate)
             audio_values = np.asarray(audio_values, dtype=np.float32)
             decode_cost_ms = (time.time() - decode_start) * 1000.0
-            self.audio_length = max(audio_values.shape[0], min_audio_len)
+            effective_audio_len = max(audio_values.shape[0], min_audio_len)
+            padded_audio_len = ((effective_audio_len + hop_length - 1) // hop_length) * hop_length
+            if padded_audio_len > audio_values.shape[0]:
+                audio_values = np.pad(
+                    audio_values,
+                    (0, padded_audio_len - audio_values.shape[0]),
+                    mode="constant",
+                    constant_values=0.0,
+                )
+
+            self.audio_length = effective_audio_len
             if should_use_raw_audio_shm():
                 self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
                 self.extra_params.pop("audio_sample_rate", None)
                 self.extra_params.pop("audio_num_samples", None)
+                self.extra_params.pop("audio_num_frames", None)
                 self._preload_data = audio_data
             else:
                 self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
                 self.extra_params["audio_sample_rate"] = target_sample_rate
                 self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
+                self.extra_params["audio_num_frames"] = int((effective_audio_len + hop_length - 1) // hop_length)
                 self._preload_data = audio_values.tobytes()
             self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
             logger.info(

From 2e480081b77a6166a89d908ad5a465e3eaefe0fd Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 31 Mar 2026 09:27:58 +0000
Subject: [PATCH 10/51] Add audio server fast path for single pending requests

---
 lightllm/server/audioserver/manager.py | 32 ++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index ac4058b643..d54856c265 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -167,6 +167,38 @@ async def loop_for_fwd(self):
                     else:
                         ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
 
+                    pending_audios = [audio for audio, ready in zip(multimodal_params.audios, ready_audio) if not ready]
+                    if (
+                        pending_audios
+                        and len(processing_group_reqs) == 0
+                        and len(self.waiting_reqs) == 0
+                        and len(pending_audios) < self.infer_batch_size
+                    ):
+                        logger.info(
+                            f"audio_batch_ready req_ids:[{group_req_indexes.group_req_id}] "
+                            f"audio_count:{len(pending_audios)} infer_batch_size:{self.infer_batch_size} fast_path:1"
+                        )
+                        self._log_req_stage(
+                            group_req_indexes.group_req_id,
+                            "audio_infer_start",
+                            batch_audio_count=len(pending_audios),
+                        )
+                        await self.infer_audios(pending_audios)
+                        self._log_req_stage(
+                            group_req_indexes.group_req_id,
+                            "audio_infer_done",
+                            batch_audio_count=len(pending_audios),
+                        )
+                        self._log_req_stage(
+                            group_req_indexes.group_req_id,
+                            "audio_send_to_next_module",
+                            target_port=self.next_module_port,
+                            fast_path=1,
+                        )
+                        self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+                        self._cleanup_req_stage(group_req_indexes.group_req_id)
+                        continue
+
                     current_req_has_pending_audio = False
                     for audio, ready in zip(multimodal_params.audios, ready_audio):
                         if not ready:

From 456a71aab0722a646945b5154d02a14420fd14a2 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Wed, 1 Apr 2026 02:46:02 +0000
Subject: [PATCH 11/51] fix num_frames

---
 lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +-
 lightllm/server/multimodal_params.py                       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index f3cd0525eb..04839e0ce8 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -415,7 +415,7 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
     def warmup(self, audio_bytes: bytes):
         audio = BytesIO(audio_bytes)
         audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
-        num_frames = (max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length
+        num_frames = max(audio.shape[0], 480) // self.processor.hop_length
         padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * (
             self.processor.hop_length
         )
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index da5d239c6a..ad70443ca7 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -113,7 +113,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
                 self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
                 self.extra_params["audio_sample_rate"] = target_sample_rate
                 self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
-                self.extra_params["audio_num_frames"] = int((effective_audio_len + hop_length - 1) // hop_length)
+                self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
                 self._preload_data = audio_values.tobytes()
             self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
             logger.info(

From 479367d3466aa582fb920abef62c7de9adac2abc Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 01:55:59 +0000
Subject: [PATCH 12/51] tune fp8

---
 ...8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json |  92 ++++++++++++++++
 ...8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json |  92 ++++++++++++++++
 ...p8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json | 101 ++++++++++++++++++
 ...p8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json | 101 ++++++++++++++++++
 .../{topk_num=8}_NVIDIA_GeForce_RTX_5090.json |  46 ++++++++
 ...6,topk_num=8}_NVIDIA_GeForce_RTX_5090.json |  68 ++++++++++++
 ...6,topk_num=8}_NVIDIA_GeForce_RTX_5090.json |  62 +++++++++++
 ...orch.float16}_NVIDIA_GeForce_RTX_5090.json |  42 ++++++++
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |  46 ++++++++
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |  90 ++++++++++++++++
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |  90 ++++++++++++++++
 ...orch.float16}_NVIDIA_GeForce_RTX_5090.json |  62 +++++++++++
 ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json |  68 ++++++++++++
 13 files changed, 960 insertions(+)
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
 create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json

diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..2a46877c76
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+  "1024": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "16384": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "512": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "800": {
+    "BLOCK_SIZE_K": 32,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "8192": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..7372d5c322
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+  "1": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 16,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "256": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 5,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..569382ce2f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+  "1": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..1456fd0b4b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+  "1024": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": true,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "128": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "16384": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "32768": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "512": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": false,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "GROUP_SIZE_M": 1,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 16,
+    "NEED_TRANS": true,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "800": {
+    "BLOCK_SIZE_K": 64,
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "GROUP_SIZE_M": 64,
+    "NEED_TRANS": true,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "8192": {
+    "BLOCK_SIZE_K": 128,
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "GROUP_SIZE_M": 32,
+    "NEED_TRANS": false,
+    "num_stages": 2,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..0f5983241f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+  "1": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 1
+  },
+  "100": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "16": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "32": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "4096": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 8
+  },
+  "64": {
+    "BLOCK_SIZE": 128,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_SIZE": 256,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..3612e98183
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+  "1": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 8
+  },
+  "100": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "1024": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 8
+  },
+  "16": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "2048": {
+    "BLOCK_DIM": 128,
+    "BLOCK_M": 2,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "256": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 2
+  },
+  "32": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_DIM": 128,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 4,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 2
+  },
+  "8": {
+    "BLOCK_DIM": 1024,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 4,
+    "num_warps": 4
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..ff46525471
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+  "1": {
+    "BLOCK_DIM": 128,
+    "BLOCK_M": 16,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "100": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  },
+  "1024": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "16": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  },
+  "2048": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "256": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 2
+  },
+  "32": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 1
+  },
+  "64": {
+    "BLOCK_DIM": 512,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 1,
+    "num_warps": 8
+  },
+  "8": {
+    "BLOCK_DIM": 256,
+    "BLOCK_M": 1,
+    "NUM_STAGE": 2,
+    "num_warps": 16
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..e3eb000004
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,42 @@
+{
+  "1": {
+    "num_stages": 1,
+    "num_warps": 1
+  },
+  "100": {
+    "num_stages": 2,
+    "num_warps": 1
+  },
+  "1024": {
+    "num_stages": 5,
+    "num_warps": 2
+  },
+  "128": {
+    "num_stages": 4,
+    "num_warps": 1
+  },
+  "16": {
+    "num_stages": 1,
+    "num_warps": 1
+  },
+  "2048": {
+    "num_stages": 4,
+    "num_warps": 1
+  },
+  "256": {
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "32": {
+    "num_stages": 5,
+    "num_warps": 1
+  },
+  "64": {
+    "num_stages": 5,
+    "num_warps": 1
+  },
+  "8": {
+    "num_stages": 1,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..9d20b4ea6b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+  "1": {
+    "num_stages": 4,
+    "num_warps": 2
+  },
+  "100": {
+    "num_stages": 1,
+    "num_warps": 1
+  },
+  "1024": {
+    "num_stages": 5,
+    "num_warps": 2
+  },
+  "128": {
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "16": {
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "2048": {
+    "num_stages": 3,
+    "num_warps": 2
+  },
+  "256": {
+    "num_stages": 2,
+    "num_warps": 2
+  },
+  "32": {
+    "num_stages": 4,
+    "num_warps": 1
+  },
+  "4096": {
+    "num_stages": 3,
+    "num_warps": 2
+  },
+  "64": {
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "8": {
+    "num_stages": 4,
+    "num_warps": 2
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..fdb476db92
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+  "1": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "100": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "1024": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 2,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "2048": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "32": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..5f06f89508
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+  "1": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  },
+  "100": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "1024": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "16": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 5,
+    "num_warps": 4
+  },
+  "2048": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 64,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 8
+  },
+  "32": {
+    "BLOCK_K": 128,
+    "BLOCK_M": 16,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "4096": {
+    "BLOCK_K": 64,
+    "BLOCK_M": 64,
+    "BLOCK_N": 128,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 32,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 4,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_K": 256,
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "GROUP_M": 8,
+    "num_stages": 3,
+    "num_warps": 8
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..d0b540f69e
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+  "1024": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "128": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "16384": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "256": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "512": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 1,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 32,
+    "NUM_STAGES": 1,
+    "num_warps": 8
+  },
+  "800": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "8192": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..6c5307023b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+  "1024": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "128": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 64,
+    "NUM_STAGES": 4,
+    "num_warps": 4
+  },
+  "16384": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "2048": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "256": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "32768": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "512": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 2,
+    "num_warps": 4
+  },
+  "64": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 4
+  },
+  "8": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 128,
+    "NUM_STAGES": 1,
+    "num_warps": 1
+  },
+  "800": {
+    "BLOCK_M": 1,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 4,
+    "num_warps": 1
+  },
+  "8192": {
+    "BLOCK_M": 8,
+    "BLOCK_N": 256,
+    "NUM_STAGES": 1,
+    "num_warps": 1
+  }
+}
\ No newline at end of file

From 2c09aa270edea34b29f73cae2109103d75073c92 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 02:15:11 +0000
Subject: [PATCH 13/51] set default model

---
 lightllm/server/api_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py
index 3d9a6bc8ed..3651bf4b64 100644
--- a/lightllm/server/api_models.py
+++ b/lightllm/server/api_models.py
@@ -187,7 +187,7 @@ def apply_loaded_defaults(cls, data: Any):
 
 
 class ChatCompletionRequest(BaseModel):
-    model: str
+    model: str = "default"
     messages: List[ChatCompletionMessageParam]
     function_call: Optional[str] = "none"
     temperature: Optional[float] = 1

From 5168dae05ca72ebfdf51ff75fd1a109310677db2 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 08:55:53 +0000
Subject: [PATCH 14/51] add prompt_text_cache to QWen3OmniTokenizer

---
 lightllm/models/qwen3_omni_moe_thinker/model.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 4a5131bbf1..6ae73fd1d1 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -1,6 +1,7 @@
 import os
 import json
 import librosa
+from collections import OrderedDict
 from io import BytesIO
 from lightllm.common.build_utils import repair_config
 from lightllm.models.registry import ModelRegistry
@@ -30,6 +31,8 @@
 class QWen3OmniTokenizer(QWen3VLTokenizer):
     def __init__(self, tokenizer=None, processor=None, **kwargs):
         self.tokenizer = tokenizer
+        self._prompt_encode_cache = OrderedDict()
+        self._prompt_encode_cache_capacity = 64
         # image
         self.image_processor = processor.image_processor
         self.min_pixel = self.image_processor.min_pixels
@@ -71,6 +74,18 @@ def get_audio_token_length(self, audio: AudioItem):
         # print(f"token_num is {token_num}  n_samples is {self.n_samples} hop_length is {self.hop_length}")
         return token_num
 
+    def _encode_prompt_text(self, prompt: str):
+        cached_ids = self._prompt_encode_cache.get(prompt)
+        if cached_ids is not None:
+            self._prompt_encode_cache.move_to_end(prompt)
+            return list(cached_ids)
+
+        origin_ids = self.tokenizer.encode(prompt)
+        self._prompt_encode_cache[prompt] = tuple(origin_ids)
+        if len(self._prompt_encode_cache) > self._prompt_encode_cache_capacity:
+            self._prompt_encode_cache.popitem(last=False)
+        return origin_ids
+
     def _caclu_audio_token_num(self, input_audio_len: int):
         _mel_len = input_audio_len // int(self.hop_length)
         input_lengths_leave = _mel_len % 100
@@ -79,7 +94,7 @@ def _caclu_audio_token_num(self, input_audio_len: int):
         return output_lengths
 
     def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
-        origin_ids = self.tokenizer.encode(prompt)
+        origin_ids = self._encode_prompt_text(prompt)
 
         # <img><image_pad></img> -> <img></img>
         origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)]

From 167f8b0e7449cc5a15755f0fe92edb5f5e95cd7f Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 08:56:50 +0000
Subject: [PATCH 15/51] multi images or audios use asyncio

---
 lightllm/server/multimodal_params.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index ad70443ca7..ce166b5980 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,4 +1,5 @@
 """Multimodal parameters for text generation."""
+import asyncio
 import os
 import wave
 import time
@@ -240,10 +241,12 @@ def __init__(
         return
 
     async def verify_and_preload(self, request: Request, audio_preload_config: dict = None):
-        for image in self.images:
-            await image.preload(request)
-        for audio in self.audios:
-            await audio.preload(request, audio_preload_config=audio_preload_config)
+        preload_coroutines = [image.preload(request) for image in self.images]
+        preload_coroutines.extend(
+            audio.preload(request, audio_preload_config=audio_preload_config) for audio in self.audios
+        )
+        if preload_coroutines:
+            await asyncio.gather(*preload_coroutines)
         return
 
     def to_dict(self):

From 30d86034554ade2d9fe350986c76a31526d2b4cc Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 08:58:05 +0000
Subject: [PATCH 16/51] single file without _resource_lock

---
 lightllm/server/httpserver/manager.py | 107 +++++++++++++++++---------
 1 file changed, 69 insertions(+), 38 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 9a6864774a..d7490ebfcd 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -131,6 +131,36 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str
         logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
         return
 
+    def _prepare_multimodal_resource_inputs(
+        self, multimodal_params: MultimodalParams, sampling_params: SamplingParams
+    ) -> Tuple[List[Union[ImageItem, AudioItem]], List[str], List[int], List[bytes]]:
+        items, md5sums, tokens_nums, datas = [], [], [], []
+
+        for img in multimodal_params.images:
+            self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
+            data = img.read()
+            token_num = self.tokenizer.get_image_token_length(img)
+            md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
+            md5sums.append(md5sum)
+            tokens_nums.append(token_num)
+            datas.append(data)
+            items.append(img)
+
+        for audio in multimodal_params.audios:
+            self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
+            data = audio.read()
+            token_num = self.tokenizer.get_audio_token_length(audio)
+            payload_md5 = audio.extra_params.get("audio_payload_md5")
+            if payload_md5 is None:
+                payload_md5 = hashlib.md5(data).hexdigest()
+            md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
+            md5sums.append(md5sum)
+            tokens_nums.append(token_num)
+            datas.append(data)
+            items.append(audio)
+
+        return items, md5sums, tokens_nums, datas
+
     async def _alloc_resource(self, items, md5sums, token_nums, datas):
 
         while True:
@@ -163,34 +193,16 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
     async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
         # 只有 P 和 NORMAL 节点需要真的管理多模态资源
         if self.pd_mode.is_P_or_NORMAL():
+            items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs(
+                multimodal_params, sampling_params
+            )
+            if len(items) <= 1:
+                await self._alloc_resource(items, md5sums, tokens_nums, datas)
+                return
             # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity，从而造成死锁的问题。
             # 如果不加任何锁，假如请求1和请求2都有6张图片，而cache_capacity为10，
             # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图，将会资源竞争产生死锁。
             async with self._resource_lock:
-                items, md5sums, tokens_nums, datas = [], [], [], []
-                for img in multimodal_params.images:
-                    self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
-                    data = img.read()
-                    # must after init_imageitem_extral_params
-                    token_num = self.tokenizer.get_image_token_length(img)
-                    md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
-                    md5sums.append(md5sum)
-                    tokens_nums.append(token_num)
-                    datas.append(data)
-                    items.append(img)
-                for audio in multimodal_params.audios:
-                    self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
-                    data = audio.read()
-                    token_num = self.tokenizer.get_audio_token_length(audio)
-                    payload_md5 = audio.extra_params.get("audio_payload_md5")
-                    if payload_md5 is None:
-                        payload_md5 = hashlib.md5(data).hexdigest()
-                    md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
-                    md5sums.append(md5sum)
-                    tokens_nums.append(token_num)
-                    datas.append(data)
-                    items.append(audio)
-
                 await self._alloc_resource(items, md5sums, tokens_nums, datas)
         return
 
@@ -295,6 +307,13 @@ async def generate(
             request.state.lightllm_req_id = group_request_id
         audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
         image_count = len(multimodal_params.images) if multimodal_params is not None else 0
+        self._log_stage_timing(
+            group_request_id,
+            start_time,
+            "received",
+            has_audio=audio_count > 0,
+            has_image=image_count > 0,
+        )
 
         try:
             original_multimodal_params = None
@@ -316,7 +335,7 @@ async def generate(
             # 记录请求到达的相关信息
             await self._log_req_header(request_headers, group_request_id)
             # encode
-            prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
+            prompt_ids = await self._encode(prompt, multimodal_params, sampling_params, start_time=start_time)
             self._log_stage_timing(
                 group_request_id,
                 start_time,
@@ -481,7 +500,11 @@ async def _log_req_header(self, request_headers, group_request_id: int):
         return
 
     async def _encode(
-        self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams
+        self,
+        prompt: Union[str, List[int]],
+        multimodal_params: MultimodalParams,
+        sampling_params: SamplingParams,
+        start_time: Optional[float] = None,
     ):
         if isinstance(prompt, str):
             if self.enable_multimodal:
@@ -490,15 +513,23 @@ async def _encode(
                 ), "too many multimodal items!"
                 if multimodal_params.audios:
                     assert not self.args.disable_audio, "audio multimodal not enabled"
-                encode_start_time = time.time()
                 await self._alloc_multimodal_resources(multimodal_params, sampling_params)
                 log_req_id = getattr(sampling_params, "group_request_id", None)
-                logger.info(
-                    f"lightllm_req_id:{log_req_id} "
-                    f"stage:alloc_multimodal_resources_done "
-                    f"elapsed_ms:{(time.time() - encode_start_time) * 1000.0:.3f} "
-                    f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
-                )
+                if start_time is None:
+                    logger.info(
+                        f"lightllm_req_id:{log_req_id} "
+                        f"stage:alloc_multimodal_resources_done "
+                        f"elapsed_ms:0.000 "
+                        f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
+                    )
+                else:
+                    self._log_stage_timing(
+                        log_req_id,
+                        start_time,
+                        "alloc_multimodal_resources_done",
+                        audio_count=len(multimodal_params.audios),
+                        image_count=len(multimodal_params.images),
+                    )
                 prompt_ids = self.tokenizer.encode(
                     prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
                 )
@@ -592,7 +623,7 @@ async def transfer_to_next_module(
 
         if self.pd_mode.is_P_or_NORMAL():
             if not self.args.disable_vision:
-                logger.info(
+                logger.debug(
                     f"lightllm_req_id:{group_req_objs.group_req_id} "
                     f"stage:transfer_to_visual "
                     f"target_port:{self.args.visual_port}"
@@ -601,7 +632,7 @@ async def transfer_to_next_module(
                 return
 
             if not self.args.disable_audio:
-                logger.info(
+                logger.debug(
                     f"lightllm_req_id:{group_req_objs.group_req_id} "
                     f"stage:transfer_to_audio "
                     f"target_port:{self.args.audio_port}"
@@ -610,7 +641,7 @@ async def transfer_to_next_module(
                 return
 
             if self.args.enable_cpu_cache:
-                logger.info(
+                logger.debug(
                     f"lightllm_req_id:{group_req_objs.group_req_id} "
                     f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}"
                 )
@@ -620,7 +651,7 @@ async def transfer_to_next_module(
                 )
                 return
 
-            logger.info(
+            logger.debug(
                 f"lightllm_req_id:{group_req_objs.group_req_id} "
                 f"stage:transfer_to_router "
                 f"target_port:{self.args.router_port}"
@@ -633,7 +664,7 @@ async def transfer_to_next_module(
 
         if self.pd_mode.is_D():
             # 在 D 模式下，不需要传输真的多模态参数，因为其已经被 P 处理好了
-            logger.info(
+            logger.debug(
                 f"lightllm_req_id:{group_req_objs.group_req_id} "
                 f"stage:transfer_to_router_from_decode "
                 f"target_port:{self.args.router_port}"

From db3e63b4ddb827003371d4e14650cdd3374415d3 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 09:00:00 +0000
Subject: [PATCH 17/51] use deque instead of list

---
 lightllm/server/audioserver/manager.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index d54856c265..a8ccb29891 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -8,6 +8,7 @@
 import inspect
 import setproctitle
 import time
+from collections import deque
 from typing import Dict, List
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
@@ -43,7 +44,7 @@ def __init__(
         self.cache_client = rpyc.connect("localhost", args.cache_port, config={"allow_pickle": True})
         self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
         self.cache_port = args.cache_port
-        self.waiting_reqs: List[GroupReqIndexes] = []
+        self.waiting_reqs = deque()
         self.model_weightdir = args.model_dir
         self.tp_world_size = args.tp
         self.audio_dp = args.audio_dp
@@ -140,7 +141,7 @@ async def loop_for_fwd(self):
                 processing_group_reqs = []
                 audios_need_infer = []
                 while len(self.waiting_reqs) > 0:
-                    group_req_indexes = self.waiting_reqs.pop(0)
+                    group_req_indexes = self.waiting_reqs.popleft()
                     self._log_req_stage(
                         group_req_indexes.group_req_id,
                         "audio_queue_picked",
@@ -174,7 +175,7 @@ async def loop_for_fwd(self):
                         and len(self.waiting_reqs) == 0
                         and len(pending_audios) < self.infer_batch_size
                     ):
-                        logger.info(
+                        logger.debug(
                             f"audio_batch_ready req_ids:[{group_req_indexes.group_req_id}] "
                             f"audio_count:{len(pending_audios)} infer_batch_size:{self.infer_batch_size} fast_path:1"
                         )
@@ -210,7 +211,7 @@ async def loop_for_fwd(self):
                                 [group_req_indexes] if current_req_has_pending_audio else []
                             )
                             batch_req_ids = [req.group_req_id for req in batch_reqs]
-                            logger.info(
+                            logger.debug(
                                 f"audio_batch_ready req_ids:{batch_req_ids} "
                                 f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
                             )
@@ -250,7 +251,7 @@ async def loop_for_fwd(self):
 
                 if len(audios_need_infer) > 0:
                     batch_req_ids = [req.group_req_id for req in processing_group_reqs]
-                    logger.info(
+                    logger.debug(
                         f"audio_batch_ready req_ids:{batch_req_ids} "
                         f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
                     )
@@ -274,7 +275,7 @@ async def loop_for_netio_req(self):
         while True:
             recv_req: GroupReqIndexes = await self.zmq_recv_socket.recv_pyobj()
             if isinstance(recv_req, GroupReqIndexes):
-                logger.info(
+                logger.debug(
                     f"audio recv req id {recv_req.group_req_id} "
                     f"audio count {len(recv_req.multimodal_params.audios)}"
                 )

From 878c2f938267f81fb5edc989f31ad93659758df4 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 04:50:45 +0000
Subject: [PATCH 18/51] chore: format merged audio/httpserver files

---
 lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +-
 lightllm/models/whisper/whisper_audio.py                   | 3 ++-
 lightllm/server/httpserver/manager.py                      | 6 ------
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c14df5ff9d..c08dd68a2f 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -101,7 +101,7 @@ def __init__(self, d_model, encoder_attention_heads, attention_dropout):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {self.num_heads})."
             )
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5
         self.attention_dropout = 0.0
         self.is_decoder = False
         self.is_causal = False
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 7eb2948281..750bf8e158 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -11,6 +11,7 @@
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
 
+
 # tokenizer_class removed
 class WhisperProcessor(ProcessorMixin):
     r"""
@@ -38,7 +39,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
 
     def get_T_after_cnn(self, L_in, dilation=1):
-        for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
+        for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "):
             L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
             L_out = 1 + L_out // stride
             L_in = L_out
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 94065cfc3a..c9eb4de543 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -164,7 +164,6 @@ def _prepare_multimodal_resource_inputs(
         return items, md5sums, tokens_nums, datas
 
     async def _alloc_resource(self, items, md5sums, token_nums, datas):
-
         while True:
             records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
 
@@ -489,7 +488,6 @@ def _count_multimodal_tokens(self, multimodal_params: MultimodalParams) -> Tuple
         return image_tokens, audio_tokens
 
     async def _log_req_header(self, request_headers, group_request_id: int):
-
         x_request_id = request_headers.get("X-Request-Id", "")
         x_session_id = request_headers.get("X-Session-Id", "")
 
@@ -622,7 +620,6 @@ async def transfer_to_next_module(
         self,
         group_req_objs: Optional[GroupReqObjs] = None,
     ):
-
         if self.pd_mode.is_P_or_NORMAL():
             if not self.args.disable_vision:
                 logger.debug(
@@ -689,7 +686,6 @@ async def _wait_to_token_package(
         req_status: "ReqStatus",
         request: Request,
     ):
-
         event = req_status.event
         unfinished_count = sampling_params.best_of
         out_token_counter = 0
@@ -820,7 +816,6 @@ async def recycle_resource_loop(self):
         pre_time_mark = time.time()
 
         while True:
-
             try:
                 await asyncio.wait_for(self.recycle_event.wait(), timeout=0.02)
             except asyncio.TimeoutError:
@@ -897,7 +892,6 @@ async def handle_loop(self):
 
                         for _ in range(read_token_count):
                             if not req.out_tokens_queue.is_empty():
-
                                 text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
                                 req.cumlogprob += float(req.shm_logprobs.arr[src_index])
                                 metadata = {

From ab788d9c41e2311f7cc1f5c41ea2bd2ec849d6db Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 04:59:16 +0000
Subject: [PATCH 19/51] chore: improve qwen3 omni audio formatting

---
 lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c08dd68a2f..c14df5ff9d 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -101,7 +101,7 @@ def __init__(self, d_model, encoder_attention_heads, attention_dropout):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {self.num_heads})."
             )
-        self.scaling = self.head_dim**-0.5
+        self.scaling = self.head_dim ** -0.5
         self.attention_dropout = 0.0
         self.is_decoder = False
         self.is_causal = False

From 0570b965a732e2f5a0655164c82844d150a35bdb Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 05:43:13 +0000
Subject: [PATCH 20/51] fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lightllm/server/audioserver/model_infer/model_rpc.py | 12 ++++++++++++
 lightllm/server/core/objs/start_args_type.py         |  2 --
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 76d5787b48..343816e1fd 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -158,6 +158,10 @@ def _get_audio_items_from_infer_queue(self, max_num: int, force_same: bool = Fal
         return tasks
 
     def _get_audio_items_from_store_queue(self, max_num: int) -> List[AudioItem]:
+        """
+        与 visual 的 _get_image_items_from_store_queue 一致：store 队列中单条为 AudioItem，
+        按批取出至多 max_num 条。
+        """
         tasks = []
         task = self.store_queue.get(block=True)
         tasks.append(task)
@@ -172,6 +176,9 @@ def _get_audio_items_from_store_queue(self, max_num: int) -> List[AudioItem]:
         return tasks
 
     def _infer_worker(self):
+        """
+        与 visual _infer_worker 一致：推理后对每个 item 单独放入 store_queue，由 store 线程批处理再 commit。
+        """
         torch.cuda.set_device(self.device_id)
         while True:
             try:
@@ -190,6 +197,7 @@ def _infer_worker(self):
 
                 self._save_to_cpu_cache(all_embeds=all_embeds, audios=audios)
 
+                # 与 visual _store_to_cpu_cache 相同条入队，便于 store 侧按 infer_max_batch_size 聚合
                 for audio in audios:
                     self.store_queue.put(audio)
 
@@ -208,6 +216,7 @@ def _save_to_cpu_cache(self, all_embeds: List[torch.Tensor], audios: List[AudioI
         return
 
     def _commit_to_cpu_cache(self, audios: List[AudioItem]):
+        # 与 visual _commit_to_cpu_cache：仅 tp0 通知完成；embed 已在 model.encode 内写入 cache
         if self.tp_rank_id == 0:
             for audio in audios:
                 audio.cuda_event.synchronize()
@@ -221,6 +230,9 @@ def _commit_to_cpu_cache(self, audios: List[AudioItem]):
             self._log_latency(audios[0], "set_items_embed")
 
     def _store_worker(self):
+        """
+        与 visual _store_worker 一致：从 store 队列按批取 AudioItem，再 commit 并释放信号量。
+        """
         while True:
             try:
                 audios: List[AudioItem] = self._get_audio_items_from_store_queue(max_num=self.infer_max_batch_size)
diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
index a920a09710..ac9bd9e180 100644
--- a/lightllm/server/core/objs/start_args_type.py
+++ b/lightllm/server/core/objs/start_args_type.py
@@ -107,12 +107,10 @@ class StartArgs:
     push_interval: int = field(default=10)
     visual_node_id: int = field(default=None)
     visual_infer_batch_size: int = field(default=None)
-    audio_infer_batch_size: int = field(default=None)
     visual_send_batch_size: int = field(default=1)
     visual_gpu_ids: List[int] = field(default_factory=lambda: [0])
     visual_tp: int = field(default=1)
     visual_dp: int = field(default=1)
-    audio_dp: int = field(default=1)
     visual_nccl_ports: List[int] = field(default=None)
     visual_rpyc_port: Optional[int] = field(default=None)
     audio_gpu_ids: Optional[List[int]] = field(default=None)

From 70aad721087731a2253a7b88a631a9994b53f3c5 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 06:36:43 +0000
Subject: [PATCH 21/51] fix

---
 .../qwen3next/triton_kernel/causal_conv1d.py  | 20 +++++++++++++++++--
 lightllm/server/api_cli.py                    |  4 ----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
index c6d099a2d8..3371aca71a 100644
--- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -4,8 +4,20 @@
 
 import torch
 
-from sgl_kernel import causal_conv1d_fwd
-from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    from sgl_kernel import causal_conv1d_fwd
+    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+except ImportError:
+    causal_conv1d_fwd = None
+    causal_conv1d_update_kernel = None
+    logger.warning(
+        "sgl_kernel is not installed, qwen3next/qwen3.5 causal_conv1d kernels are unavailable. "
+        "Install `sgl_kernel` before serving those models."
+    )
 
 
 def causal_conv1d_fn(
@@ -51,6 +63,8 @@ def causal_conv1d_fn(
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
+    if causal_conv1d_fwd is None:
+        raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_fn")
     if x.stride(-1) != 1:
         x = x.contiguous()
     bias = bias.contiguous() if bias is not None else None
@@ -103,6 +117,8 @@ def causal_conv1d_update(
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
+    if causal_conv1d_update_kernel is None:
+        raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_update")
     activation_val = activation in ["silu", "swish"]
     unsqueeze = x.dim() == 2
     if unsqueeze:
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index 01bf4d306b..89aeeec833 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -460,9 +460,6 @@ def make_argument_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch"
     )
-    parser.add_argument(
-        "--audio_infer_batch_size", type=int, default=None, help="number of audios to process in each inference batch"
-    )
     parser.add_argument(
         "--visual_send_batch_size",
         type=int,
@@ -477,7 +474,6 @@ def make_argument_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT")
     parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT")
-    parser.add_argument("--audio_dp", type=int, default=1, help="number of data parallel instances for audio encoder")
     parser.add_argument(
         "--visual_nccl_ports",
         nargs="+",

From 86a16f708d42395e8692022ae28a8805fbcb1b27 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 02:50:53 +0000
Subject: [PATCH 22/51] fix md5 and

---
 lightllm/server/httpserver/manager.py | 79 +++++++++++----------------
 1 file changed, 31 insertions(+), 48 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index c9eb4de543..e9843c8237 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -128,41 +128,9 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str
         cost_ms = (time.time() - start_time) * 1000.0
         extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
         suffix = f" {extras}" if extras else ""
-        logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+        logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
         return
 
-    def _prepare_multimodal_resource_inputs(
-        self, multimodal_params: MultimodalParams, sampling_params: SamplingParams
-    ) -> Tuple[List[Union[ImageItem, AudioItem]], List[str], List[int], List[bytes]]:
-        items, md5sums, tokens_nums, datas = [], [], [], []
-
-        for img in multimodal_params.images:
-            self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
-            data = img.read()
-            token_num = self.tokenizer.get_image_token_length(img)
-            md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
-            img.md5 = md5sum
-            md5sums.append(md5sum)
-            tokens_nums.append(token_num)
-            datas.append(data)
-            items.append(img)
-
-        for audio in multimodal_params.audios:
-            self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
-            data = audio.read()
-            token_num = self.tokenizer.get_audio_token_length(audio)
-            payload_md5 = audio.extra_params.get("audio_payload_md5")
-            if payload_md5 is None:
-                payload_md5 = hashlib.md5(data).hexdigest()
-            md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
-            audio.md5 = md5sum
-            md5sums.append(md5sum)
-            tokens_nums.append(token_num)
-            datas.append(data)
-            items.append(audio)
-
-        return items, md5sums, tokens_nums, datas
-
     async def _alloc_resource(self, items, md5sums, token_nums, datas):
         while True:
             records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
@@ -197,6 +165,29 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
             items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs(
                 multimodal_params, sampling_params
             )
+            items, md5sums, tokens_nums, datas = [], [], [], []
+            for img in multimodal_params.images:
+                self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
+                data = img.read()
+                token_num = self.tokenizer.get_image_token_length(img)
+                md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
+                img.md5 = md5sum
+                md5sums.append(md5sum)
+                tokens_nums.append(token_num)
+                datas.append(data)
+                items.append(img)
+            for audio in multimodal_params.audios:
+                self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
+                data = audio.read()
+                token_num = self.tokenizer.get_audio_token_length(audio)
+                payload_md5 = audio.extra_params.get("audio_payload_md5")
+                md5sum = payload_md5
+                audio.md5 = md5sum
+                md5sums.append(md5sum)
+                tokens_nums.append(token_num)
+                datas.append(data)
+                items.append(audio)
+
             if len(items) <= 1:
                 await self._alloc_resource(items, md5sums, tokens_nums, datas)
                 return
@@ -515,21 +506,13 @@ async def _encode(
                     assert not self.args.disable_audio, "audio multimodal not enabled"
                 await self._alloc_multimodal_resources(multimodal_params, sampling_params)
                 log_req_id = getattr(sampling_params, "group_request_id", None)
-                if start_time is None:
-                    logger.info(
-                        f"lightllm_req_id:{log_req_id} "
-                        f"stage:alloc_multimodal_resources_done "
-                        f"elapsed_ms:0.000 "
-                        f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
-                    )
-                else:
-                    self._log_stage_timing(
-                        log_req_id,
-                        start_time,
-                        "alloc_multimodal_resources_done",
-                        audio_count=len(multimodal_params.audios),
-                        image_count=len(multimodal_params.images),
-                    )
+                self._log_stage_timing(
+                    log_req_id,
+                    start_time,
+                    "alloc_multimodal_resources_done",
+                    audio_count=len(multimodal_params.audios),
+                    image_count=len(multimodal_params.images),
+                )
                 prompt_ids = self.tokenizer.encode(
                     prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
                 )

From 46016378357ff382ecc492405bd9c3cdfc4ee6c9 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 02:51:43 +0000
Subject: [PATCH 23/51] fix md5

---
 lightllm/server/multimodal_params.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index ce20e5d657..0aac1874c8 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -13,6 +13,8 @@
 from fastapi import Request
 from lightllm.utils.multimodal_utils import fetch_resource
 from lightllm.utils.log_utils import init_logger
+from frozendict import frozendict
+
 
 logger = init_logger(__name__)
 RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
@@ -118,7 +120,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
                 self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
                 self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
                 self._preload_data = audio_values.tobytes()
-            self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
+            self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
             logger.info(
                 f"lightllm_req_id:{req_id} stage:audio_preload_done "
                 f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "

From 16203e4510f89d23acca2d81d4862975eed82d4c Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 03:00:13 +0000
Subject: [PATCH 24/51] format

---
 lightllm/server/multimodal_params.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 0aac1874c8..e62e73fade 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -120,7 +120,9 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
                 self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
                 self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
                 self._preload_data = audio_values.tobytes()
-            self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
+            self.extra_params["audio_payload_md5"] = (
+                hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
+            )
             logger.info(
                 f"lightllm_req_id:{req_id} stage:audio_preload_done "
                 f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "

From 93421d28662a45bc8ac86e658561a33a612612ef Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 03:13:51 +0000
Subject: [PATCH 25/51] use asyncio.to_thread for librosa.load to avoid blocking
 the server from handling other concurrent requests

---
 lightllm/server/multimodal_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index e62e73fade..5847975878 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -94,7 +94,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
 
             # check if valid audio bytes
             decode_start = time.time()
-            audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate)
+            audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=target_sample_rate)
             audio_values = np.asarray(audio_values, dtype=np.float32)
             decode_cost_ms = (time.time() - decode_start) * 1000.0
             effective_audio_len = max(audio_values.shape[0], min_audio_len)

From f7b05898d0948404d685ab5094ed4c1aab2bd27e Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 06:06:25 +0000
Subject: [PATCH 26/51] fix: drop redundant _prepare_multimodal_resource_inputs call

---
 lightllm/server/httpserver/manager.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e9843c8237..0e4a9b79eb 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -162,9 +162,6 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
     async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
         # 只有 P 和 NORMAL 节点需要真的管理多模态资源
         if self.pd_mode.is_P_or_NORMAL():
-            items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs(
-                multimodal_params, sampling_params
-            )
             items, md5sums, tokens_nums, datas = [], [], [], []
             for img in multimodal_params.images:
                 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)

From 0ea215605bec31a05c33b8b9b6ea1832a8ac6464 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Tue, 7 Apr 2026 08:52:11 +0000
Subject: [PATCH 27/51] fix: reuse httpx.AsyncClient per proxy via lru_cache

---
 lightllm/utils/multimodal_utils.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py
index 5f4fd18516..4b49ea8891 100644
--- a/lightllm/utils/multimodal_utils.py
+++ b/lightllm/utils/multimodal_utils.py
@@ -4,14 +4,11 @@
 import httpx
 from PIL import Image
 from io import BytesIO
-from urllib.parse import urlparse
-from typing import Dict, Optional
 from fastapi import Request
+from functools import lru_cache
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
-_HTTP_CLIENTS: Dict[Optional[str], httpx.AsyncClient] = {}
-_LOCAL_RESOURCE_HOSTS = {"127.0.0.1", "localhost", "::1"}
 
 
 def _httpx_async_client_proxy_kwargs(proxy) -> dict:
@@ -39,15 +36,17 @@ def image2base64(img_str: str):
     return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
 
+@lru_cache(maxsize=256)
+def _get_xhttp_client(proxy=None):
+    kvargs = _httpx_async_client_proxy_kwargs(proxy)
+    kvargs["limits"] = httpx.Limits(max_connections=10000, max_keepalive_connections=20)
+    return httpx.AsyncClient(**kvargs)
+
+
 async def fetch_resource(url, request: Request, timeout, proxy=None):
     logger.info(f"Begin to download resource from url: {url}")
     start_time = time.time()
-    hostname = urlparse(url).hostname
-    effective_proxy = None if hostname in _LOCAL_RESOURCE_HOSTS else proxy
-    client = _HTTP_CLIENTS.get(effective_proxy)
-    if client is None:
-        client = httpx.AsyncClient(**_httpx_async_client_proxy_kwargs(effective_proxy))
-        _HTTP_CLIENTS[effective_proxy] = client
+    client = _get_xhttp_client(proxy)
     async with client.stream("GET", url, timeout=timeout) as response:
         response.raise_for_status()
         ans_bytes = []

From 6856540018aff2b4614b64e1af88374da721ac84 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 13:20:21 +0000
Subject: [PATCH 28/51] fix: import sgl_kernel lazily inside causal_conv1d functions

---
 .../qwen3next/triton_kernel/causal_conv1d.py  | 23 ++++---------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
index 3371aca71a..2bf325340f 100644
--- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -4,21 +4,6 @@
 
 import torch
 
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
-
-try:
-    from sgl_kernel import causal_conv1d_fwd
-    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
-except ImportError:
-    causal_conv1d_fwd = None
-    causal_conv1d_update_kernel = None
-    logger.warning(
-        "sgl_kernel is not installed, qwen3next/qwen3.5 causal_conv1d kernels are unavailable. "
-        "Install `sgl_kernel` before serving those models."
-    )
-
 
 def causal_conv1d_fn(
     x: torch.Tensor,
@@ -63,8 +48,8 @@ def causal_conv1d_fn(
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
-    if causal_conv1d_fwd is None:
-        raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_fn")
+    from sgl_kernel import causal_conv1d_fwd
+
     if x.stride(-1) != 1:
         x = x.contiguous()
     bias = bias.contiguous() if bias is not None else None
@@ -117,8 +102,8 @@ def causal_conv1d_update(
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
-    if causal_conv1d_update_kernel is None:
-        raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_update")
+    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
     activation_val = activation in ["silu", "swish"]
     unsqueeze = x.dim() == 2
     if unsqueeze:

From 9d0671b7ba3b01c995f8a4e4fefef7fb94d80f8d Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 13:26:37 +0000
Subject: [PATCH 29/51] log stage timing only when args.detail_log is set

---
 lightllm/server/httpserver/manager.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 0e4a9b79eb..e2a0dbc4b6 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -125,10 +125,11 @@ def __init__(
         return
 
     def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs):
-        cost_ms = (time.time() - start_time) * 1000.0
-        extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
-        suffix = f" {extras}" if extras else ""
-        logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+        if self.args.detail_log:
+            cost_ms = (time.time() - start_time) * 1000.0
+            extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+            suffix = f" {extras}" if extras else ""
+            logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
         return
 
     async def _alloc_resource(self, items, md5sums, token_nums, datas):

From 8e21207325fd8205cedebf6c9f30efa60a152bbb Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 13:40:33 +0000
Subject: [PATCH 30/51] delete warmup

---
 .../qwen3_omni_audio.py                       | 24 ---------------
 lightllm/models/whisper/whisper_audio.py      | 20 -------------
 lightllm/server/api_http.py                   |  7 +----
 .../audioserver/model_infer/model_rpc.py      | 30 -------------------
 lightllm/server/multimodal_params.py          | 22 --------------
 5 files changed, 1 insertion(+), 102 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c14df5ff9d..7d525915af 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -2,9 +2,7 @@
 import json
 import math
 import torch
-import librosa
 import numpy as np
-from io import BytesIO
 from torch import Tensor, nn
 from safetensors import safe_open
 from torch.nn import functional as F
@@ -16,10 +14,6 @@
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
-from lightllm.utils.log_utils import init_logger
-
-
-logger = init_logger(__name__)
 
 
 def _get_feat_extract_output_lengths(input_lengths):
@@ -376,21 +370,3 @@ def encode(self, audio_items: List[AudioItem]):
             all_embeds.append(cur_embed)
 
         return all_embeds, audio_items
-
-    @torch.no_grad()
-    def warmup(self, audio_bytes: bytes):
-        audio = BytesIO(audio_bytes)
-        audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
-        num_frames = max(audio.shape[0], 480) // self.processor.hop_length
-        padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * (
-            self.processor.hop_length
-        )
-        if padded_len > audio.shape[0]:
-            audio = np.pad(audio, (0, padded_len - audio.shape[0]), mode="constant", constant_values=0.0)
-        input_features, feature_lens = self.processor._preprocess_single_padded(audio, num_frames, device="cpu")
-        _ = self.forward(
-            input_features,
-            feature_lens=feature_lens,
-        )
-        torch.cuda.current_stream().synchronize()
-        return
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 750bf8e158..4cd9619e55 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -1,10 +1,8 @@
 import os
 import json
-import librosa
 import numpy as np
 import torch
 import torch.nn.functional as F
-from io import BytesIO
 from typing import List, Union
 from safetensors.torch import load_file
 from transformers.processing_utils import ProcessorMixin
@@ -225,21 +223,3 @@ def encode(self, audio_items: List[AudioItem]):
             ans_embeds.append(cur_embed)
 
         return ans_embeds, audio_items
-
-    @torch.no_grad()
-    def warmup(self, audio_bytes: bytes):
-        audio = BytesIO(audio_bytes)
-        audio, _ = librosa.load(audio, sr=16000)
-
-        from .defaults import MIN_AUDIO_LEN
-
-        if audio.shape[0] < MIN_AUDIO_LEN:
-            audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0)
-
-        batch_audio_lens = np.array([min(audio.shape[0], self.max_length)], dtype=np.int32)
-        audios, audio_lens_after_cnn = self.audio_processor(
-            [audio], batch_audio_lens, sampling_rate=16000, return_tensors="pt"
-        )
-        _ = self.forward(audios, audio_lens_after_cnn)
-        torch.cuda.current_stream().synchronize()
-        return
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 1322168e38..40d20bcd27 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -41,7 +41,7 @@
 from fastapi.responses import Response, StreamingResponse, JSONResponse
 from lightllm.server.core.objs.sampling_params import SamplingParams
 from lightllm.server.core.objs import StartArgs
-from .multimodal_params import MultimodalParams, warmup_audio_preload
+from .multimodal_params import MultimodalParams
 from .httpserver.manager import HttpServerManager
 from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster
 from .api_lightllm import lightllm_get_score
@@ -389,11 +389,6 @@ async def startup_event():
     logger.info("server start up")
     loop = asyncio.get_event_loop()
     g_objs.set_args(get_env_start_args())
-    if g_objs.args.enable_multimodal and not g_objs.args.disable_audio:
-        warmup_start = time.time()
-        logger.info("http_audio_preload_warmup_start")
-        await warmup_audio_preload()
-        logger.info(f"http_audio_preload_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}")
     loop.create_task(g_objs.httpserver_manager.handle_loop())
     logger.info(f"server start up ok, loop use is {asyncio.get_event_loop()}")
     return
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 343816e1fd..8a04231508 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -1,10 +1,6 @@
-import io
 import queue
 import threading
 import time
-import wave
-
-import numpy as np
 import rpyc
 import socket
 import torch
@@ -25,17 +21,6 @@
 logger = init_logger(__name__)
 
 
-def _generate_silence_wav_bytes(sample_rate: int = 16000, num_samples: int = 16000) -> bytes:
-    samples = np.zeros(num_samples, dtype=np.int16)
-    buffer = io.BytesIO()
-    with wave.open(buffer, "wb") as wav_file:
-        wav_file.setnchannels(1)
-        wav_file.setsampwidth(2)
-        wav_file.setframerate(sample_rate)
-        wav_file.writeframes(samples.tobytes())
-    return buffer.getvalue()
-
-
 class AudioModelRpcServer(rpyc.Service):
     def exposed_init_model(self, kvargs):
         kvargs = obtain(kvargs)
@@ -74,7 +59,6 @@ def exposed_init_model(self, kvargs):
                 create_meta_data=False,
                 init_shm_data=False,
             )
-            self._auto_warmup_model()
             self._init_taskes()
         except Exception as e:
             print("#" * 16)
@@ -87,20 +71,6 @@ def exposed_init_model(self, kvargs):
         set_random_seed(2147483647)
         return
 
-    def _auto_warmup_model(self):
-        if not hasattr(self.model, "warmup"):
-            return
-        try:
-            torch.cuda.set_device(self.device_id)
-            warmup_audio = _generate_silence_wav_bytes()
-            self.model.warmup(warmup_audio)
-            logger.info(
-                f"audio model auto warmup finished on dp_rank_id:{self.dp_rank_id} tp_rank_id:{self.tp_rank_id}"
-            )
-        except Exception as e:
-            logger.exception(f"audio model auto warmup failed: {e}")
-            raise
-
     def exposed_run_task(self, audios: List[AudioItem], ref_event_list: List[threading.Event]):
         try:
             audios = obtain(audios)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 5847975878..79ef2fe028 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,7 +1,6 @@
 """Multimodal parameters for text generation."""
 import asyncio
 import os
-import wave
 import time
 import librosa
 import base64
@@ -25,17 +24,6 @@
 DEFAULT_MIN_AUDIO_LEN = 480
 
 
-def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
-    num_samples = max(1, int(sample_rate * duration_seconds))
-    with BytesIO() as buffer:
-        with wave.open(buffer, "wb") as wav_file:
-            wav_file.setnchannels(1)
-            wav_file.setsampwidth(2)
-            wav_file.setframerate(sample_rate)
-            wav_file.writeframes(b"\x00\x00" * num_samples)
-        return buffer.getvalue()
-
-
 def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
     audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT)
     if audio_shm_format == WAVEFORM_F32_SHM_FORMAT:
@@ -273,13 +261,3 @@ def to_origin_dict(self):
         ret["images"] = [i.to_origin_dict() for i in self.images]
         ret["audios"] = [a.to_origin_dict() for a in self.audios]
         return ret
-
-
-async def warmup_audio_preload():
-    warmup_audio = AudioItem(
-        type="base64",
-        data=base64.b64encode(generate_silence_wav_bytes()).decode("utf-8"),
-    )
-    await warmup_audio.preload(None)
-    warmup_audio.read()
-    return

From fe39faa1b994802083fa7acd3539a5371eebcbad Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 14:21:01 +0000
Subject: [PATCH 31/51] delete audio_preload_config

---
 .../common/basemodel/multimodal_tokenizer.py  |  1 -
 lightllm/models/internvl/model.py             |  5 --
 .../models/qwen3_omni_moe_thinker/model.py    |  5 --
 .../qwen3_omni_audio.py                       | 26 +++----
 lightllm/server/api_http.py                   |  4 +-
 .../audioserver/model_infer/model_rpc.py      |  1 -
 lightllm/server/httpserver/manager.py         |  4 +-
 lightllm/server/multimodal_params.py          | 76 +++----------------
 8 files changed, 24 insertions(+), 98 deletions(-)

diff --git a/lightllm/common/basemodel/multimodal_tokenizer.py b/lightllm/common/basemodel/multimodal_tokenizer.py
index 872a418bf7..cdcbd7f089 100644
--- a/lightllm/common/basemodel/multimodal_tokenizer.py
+++ b/lightllm/common/basemodel/multimodal_tokenizer.py
@@ -33,7 +33,6 @@
 class BaseMultiModalTokenizer(ABC):
     def __init__(self, tokenizer, **kwargs):
         self.tokenizer = tokenizer
-        self.audio_preload_config = None
 
     def __getattr__(self, name):
         obj_dict = object.__getattribute__(self, "__dict__")
diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py
index 70c797aeb8..ccb76d3512 100644
--- a/lightllm/models/internvl/model.py
+++ b/lightllm/models/internvl/model.py
@@ -50,11 +50,6 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
 
         self.audio_min_length = MIN_AUDIO_LEN
         self.audio_max_length = 16000 * 30
-        self.audio_preload_config = {
-            "sampling_rate": 16000,
-            "hop_length": 160,
-            "min_audio_len": int(self.audio_min_length),
-        }
 
     def init_imageitem_extral_params(
         self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 6ae73fd1d1..79ce939714 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -45,11 +45,6 @@ def __init__(self, tokenizer=None, processor=None, **kwargs):
         self.sampling_rate = self.audio_processor.sampling_rate
         self.n_samples = self.audio_processor.n_samples
         self.hop_length = self.audio_processor.hop_length
-        self.audio_preload_config = {
-            "sampling_rate": int(self.sampling_rate),
-            "hop_length": int(self.hop_length),
-            "min_audio_len": int(MIN_AUDIO_LEN),
-        }
 
         self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"]
         self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"]
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 7d525915af..71fdb3f3b1 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -9,7 +9,7 @@
 from typing import Callable, Optional, Union, List
 from transformers.activations import ACT2FN
 
-from lightllm.server.multimodal_params import AudioItem, WAVEFORM_F32_SHM_FORMAT, load_audio_from_shm_payload
+from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
@@ -338,25 +338,19 @@ def encode(self, audio_items: List[AudioItem]):
                 items.append(item)
                 audio_data = read_shm(get_shm_name_data(item.uuid))
                 audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
-                audio_num_frames = item.extra_params.get("audio_num_frames")
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
-            if audio_num_frames is not None and item.extra_params.get("audio_shm_format") == WAVEFORM_F32_SHM_FORMAT:
-                input_features, feature_lens = self.processor._preprocess_single_padded(
-                    audio, int(audio_num_frames), device="cpu"
-                )
+            input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
+            if feature_attention_mask is not None:
+                audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+                input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
             else:
-                input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
-                if feature_attention_mask is not None:
-                    audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
-                    input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
-                else:
-                    audio_feature_lengths = None
-
-                feature_lens = (
-                    audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
-                )
+                audio_feature_lengths = None
+
+            feature_lens = (
+                audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+            )
 
             audio_features = self.forward(
                 input_features,
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 40d20bcd27..50d992bf9c 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -300,9 +300,7 @@ async def tokens(request: Request):
 
         multimodal_params_dict = request_dict.get("multimodal_params", {})
         multimodal_params = MultimodalParams(**multimodal_params_dict)
-        await multimodal_params.verify_and_preload(
-            request, audio_preload_config=getattr(g_objs.httpserver_manager.tokenizer, "audio_preload_config", None)
-        )
+        await multimodal_params.verify_and_preload(request)
         return JSONResponse(
             {
                 "ntokens": g_objs.httpserver_manager.tokens(
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 8a04231508..39a7e06ac3 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -8,7 +8,6 @@
 from typing import List
 from transformers.configuration_utils import PretrainedConfig
 from rpyc.utils.classic import obtain
-
 from lightllm.models.whisper.whisper_audio import WhisperAudioModel
 from lightllm.models.qwen3_omni_moe_thinker.qwen3_omni_audio import Qwen3OmniMoeAudioEncoder
 from lightllm.server.multimodal_params import AudioItem
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e2a0dbc4b6..acfe04850f 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -311,9 +311,7 @@ async def generate(
                 original_multimodal_params = copy.deepcopy(multimodal_params)
 
             if self.pd_mode.is_P_or_NORMAL():
-                await multimodal_params.verify_and_preload(
-                    request, audio_preload_config=getattr(self.tokenizer, "audio_preload_config", None)
-                )
+                await multimodal_params.verify_and_preload(request)
                 self._log_stage_timing(
                     group_request_id,
                     start_time,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 79ef2fe028..f103e54ce5 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -16,28 +16,14 @@
 
 
 logger = init_logger(__name__)
-RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
-WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
-AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
 DEFAULT_AUDIO_SAMPLE_RATE = 16000
-DEFAULT_AUDIO_HOP_LENGTH = 160
-DEFAULT_MIN_AUDIO_LEN = 480
 
 
 def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
-    audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT)
-    if audio_shm_format == WAVEFORM_F32_SHM_FORMAT:
-        num_samples = int(extra_params.get("audio_num_samples", 0))
-        if num_samples > 0:
-            return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
-        return np.frombuffer(audio_data, dtype=np.float32)
-
-    audio, _ = librosa.load(BytesIO(audio_data), sr=sample_rate)
-    return np.asarray(audio, dtype=np.float32)
-
-
-def should_use_raw_audio_shm() -> bool:
-    return os.getenv(AUDIO_SHM_USE_RAW_ENV, "0") == "1"
+    num_samples = int(extra_params.get("audio_num_samples", 0))
+    if num_samples > 0:
+        return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
+    return np.frombuffer(audio_data, dtype=np.float32)
 
 
 class AudioItem:
@@ -60,11 +46,8 @@ def __init__(self, **kwargs):
         self._preload_data = None
         self.extra_params = {}
 
-    async def preload(self, request: Request, audio_preload_config: dict = None):
+    async def preload(self, request: Request):
         try:
-            req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None)
-            preload_start = time.time()
-            source_ready_start = preload_start
             if self._type == "url":
                 timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
                 proxy = os.getenv("REQUEST_PROXY", None)
@@ -73,51 +56,18 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
                 audio_data = base64.b64decode(self._data)
             else:
                 raise ValueError(f"cannot read audio which type is {self._type}!")
-            source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0
-
-            audio_preload_config = audio_preload_config or {}
-            target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE))
-            hop_length = int(audio_preload_config.get("hop_length", DEFAULT_AUDIO_HOP_LENGTH))
-            min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN))
 
             # check if valid audio bytes
-            decode_start = time.time()
-            audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=target_sample_rate)
+            audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=DEFAULT_AUDIO_SAMPLE_RATE)
             audio_values = np.asarray(audio_values, dtype=np.float32)
-            decode_cost_ms = (time.time() - decode_start) * 1000.0
-            effective_audio_len = max(audio_values.shape[0], min_audio_len)
-            padded_audio_len = ((effective_audio_len + hop_length - 1) // hop_length) * hop_length
-            if padded_audio_len > audio_values.shape[0]:
-                audio_values = np.pad(
-                    audio_values,
-                    (0, padded_audio_len - audio_values.shape[0]),
-                    mode="constant",
-                    constant_values=0.0,
-                )
+            from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
 
-            self.audio_length = effective_audio_len
-            if should_use_raw_audio_shm():
-                self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
-                self.extra_params.pop("audio_sample_rate", None)
-                self.extra_params.pop("audio_num_samples", None)
-                self.extra_params.pop("audio_num_frames", None)
-                self._preload_data = audio_data
-            else:
-                self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
-                self.extra_params["audio_sample_rate"] = target_sample_rate
-                self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
-                self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
-                self._preload_data = audio_values.tobytes()
+            self.audio_length = max(int(audio_values.shape[0]), MIN_AUDIO_LEN)
+            self._preload_data = audio_values.tobytes()
+            self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
             self.extra_params["audio_payload_md5"] = (
                 hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
             )
-            logger.info(
-                f"lightllm_req_id:{req_id} stage:audio_preload_done "
-                f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
-                f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} "
-                f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length} "
-                f"shm_format:{self.extra_params['audio_shm_format']}"
-            )
             return
 
         except Exception as e:
@@ -238,11 +188,9 @@ def __init__(
         self.audios = [AudioItem(**a) for a in audios]
         return
 
-    async def verify_and_preload(self, request: Request, audio_preload_config: dict = None):
+    async def verify_and_preload(self, request: Request):
         preload_coroutines = [image.preload(request) for image in self.images]
-        preload_coroutines.extend(
-            audio.preload(request, audio_preload_config=audio_preload_config) for audio in self.audios
-        )
+        preload_coroutines.extend(audio.preload(request) for audio in self.audios)
         if preload_coroutines:
             await asyncio.gather(*preload_coroutines)
         return

From f1c9f0770a5e8452fbffe62c694f0ccfdbbf7d4c Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 14:24:16 +0000
Subject: [PATCH 32/51] delete _preprocess_single_padded

---
 .../qwen3_omni_moe_thinker/audio_process.py   | 23 -------------------
 1 file changed, 23 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 42eae8edb5..e9dc931886 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -105,29 +105,6 @@ def zero_mean_unit_var_norm(
 
         return normed_input_values
 
-    def _preprocess_single_padded(
-        self,
-        raw_speech: np.ndarray,
-        num_frames: int,
-        device: Optional[str] = "cpu",
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        waveform = np.asarray(raw_speech, dtype=np.float32)
-        if waveform.ndim != 1:
-            raise ValueError(f"single audio fast path expects 1D waveform, got shape={waveform.shape}")
-
-        extracted = self._torch_extract_fbank_features(waveform[None, :], device)
-        extracted = np.asarray(extracted, dtype=np.float32)
-        if extracted.ndim != 3:
-            raise ValueError(f"unexpected extracted feature shape={extracted.shape}")
-
-        if extracted.shape[-1] < num_frames:
-            raise ValueError(f"feature frames {extracted.shape[-1]} < requested num_frames {num_frames}")
-
-        compact_features = torch.from_numpy(extracted[:, :, :num_frames]).to(device="cuda", dtype=torch.bfloat16)
-        compact_features = compact_features[0].contiguous()
-        feature_lens = torch.tensor([num_frames], device="cuda", dtype=torch.long)
-        return compact_features, feature_lens
-
     def _preprocess(
         self,
         raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],

From 9bee105b4a7e89e27ac11f783872793dcb643ed8 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 01:58:11 +0000
Subject: [PATCH 33/51] fix

---
 lightllm/server/multimodal_params.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index f103e54ce5..2e8ed701e4 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -189,10 +189,11 @@ def __init__(
         return
 
     async def verify_and_preload(self, request: Request):
-        preload_coroutines = [image.preload(request) for image in self.images]
-        preload_coroutines.extend(audio.preload(request) for audio in self.audios)
-        if preload_coroutines:
-            await asyncio.gather(*preload_coroutines)
+        tasks = [image.preload(request) for image in self.images]
+        tasks += [audio.preload(request) for audio in self.audios]
+
+        if tasks:
+            await asyncio.gather(*tasks)
         return
 
     def to_dict(self):

From 6c9c49067cd6d1480685ae2636637b9aefe56cd2 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 02:51:45 +0000
Subject: [PATCH 34/51] fix

---
 lightllm/server/httpserver/manager.py | 82 +++++++++++++++------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index acfe04850f..d5dcd37825 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -133,33 +133,48 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str
         return
 
     async def _alloc_resource(self, items, md5sums, token_nums, datas):
-        while True:
-            records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
-
-            if records is None:
-                await asyncio.sleep(0.1)
-                continue
-
-            if isinstance(records, str) and "error" in records:
-                logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
-                raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
-
-            update_data_ids = []
-            for item, rec, data in zip(items, records, datas):
-                item: Union[ImageItem, AudioItem] = item
-                item.uuid = rec["id"]
-                item.token_id = rec["token_id"]
-                item.token_num = rec["token_num"]
-                item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
-
-                if not rec["data_ready"]:
-                    create_shm(get_shm_name_data(rec["id"]), data)
-                    update_data_ids.append(rec["id"])
-
-            if update_data_ids:
-                self.cache_client.root.set_items_data(update_data_ids)
+        if len(items) == 0:
             return
 
+        for _ in range(1000):
+            # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity，从而造成死锁的问题。
+            # 如果不加任何锁，假如请求1和请求2都有6张图片，而cache_capacity为10，
+            # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图，将会资源竞争产生死锁。
+            async with self._resource_lock:
+                records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+                if records is not None:
+                    break
+                await asyncio.sleep(0.01)
+
+        # 长时间无法申请到足够资源的时候，则开始进行阻塞式尝试，防止其他请求一起申请相关资源。
+        if records is None:
+            async with self._resource_lock:
+                while records is None:
+                    records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+                    if records is not None:
+                        break
+                    await asyncio.sleep(0.1)
+
+        if isinstance(records, str) and "error" in records:
+            logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
+            raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
+
+        update_data_ids = []
+        for item, rec, data in zip(items, records, datas):
+            item: Union[ImageItem, AudioItem] = item
+            item.uuid = rec["id"]
+            item.token_id = rec["token_id"]
+            item.token_num = rec["token_num"]
+            item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
+
+            if not rec["data_ready"]:
+                create_shm(get_shm_name_data(rec["id"]), data)
+                update_data_ids.append(rec["id"])
+
+        if update_data_ids:
+            self.cache_client.root.set_items_data(update_data_ids)
+        return
+
     async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
         # 只有 P 和 NORMAL 节点需要真的管理多模态资源
         if self.pd_mode.is_P_or_NORMAL():
@@ -167,10 +182,11 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
             for img in multimodal_params.images:
                 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
                 data = img.read()
+                # must after init_imageitem_extral_params
                 token_num = self.tokenizer.get_image_token_length(img)
                 md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
-                img.md5 = md5sum
                 md5sums.append(md5sum)
+                img.md5 = md5sum
                 tokens_nums.append(token_num)
                 datas.append(data)
                 items.append(img)
@@ -178,22 +194,14 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
                 self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
                 data = audio.read()
                 token_num = self.tokenizer.get_audio_token_length(audio)
-                payload_md5 = audio.extra_params.get("audio_payload_md5")
-                md5sum = payload_md5
-                audio.md5 = md5sum
+                md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
                 md5sums.append(md5sum)
+                audio.md5 = md5sum
                 tokens_nums.append(token_num)
                 datas.append(data)
                 items.append(audio)
 
-            if len(items) <= 1:
-                await self._alloc_resource(items, md5sums, tokens_nums, datas)
-                return
-            # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity，从而造成死锁的问题。
-            # 如果不加任何锁，假如请求1和请求2都有6张图片，而cache_capacity为10，
-            # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图，将会资源竞争产生死锁。
-            async with self._resource_lock:
-                await self._alloc_resource(items, md5sums, tokens_nums, datas)
+            await self._alloc_resource(items, md5sums, tokens_nums, datas)
         return
 
     async def _release_multimodal_resources(self, multimodal_params: MultimodalParams):

From 3b057d0b6c450f167c6f2534e75d74a1c5801f0c Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 02:55:00 +0000
Subject: [PATCH 35/51] fix

---
 lightllm/server/httpserver/manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index d5dcd37825..115be4bd38 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -136,7 +136,7 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
         if len(items) == 0:
             return
 
-        for _ in range(1000):
+        for _ in range(2000):
             # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity，从而造成死锁的问题。
             # 如果不加任何锁，假如请求1和请求2都有6张图片，而cache_capacity为10，
             # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图，将会资源竞争产生死锁。
@@ -144,7 +144,7 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
                 records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
                 if records is not None:
                     break
-                await asyncio.sleep(0.01)
+                await asyncio.sleep(0.005)
 
         # 长时间无法申请到足够资源的时候，则开始进行阻塞式尝试，防止其他请求一起申请相关资源。
         if records is None:

From a8a8130932a90e1a51c4f94665357ed6127005a3 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 03:03:35 +0000
Subject: [PATCH 36/51] fix

---
 lightllm/server/multimodal_params.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 2e8ed701e4..e45a28db12 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,10 +1,8 @@
 """Multimodal parameters for text generation."""
 import asyncio
 import os
-import time
 import librosa
 import base64
-import hashlib
 import numpy as np
 from typing import List
 from io import BytesIO
@@ -12,11 +10,9 @@
 from fastapi import Request
 from lightllm.utils.multimodal_utils import fetch_resource
 from lightllm.utils.log_utils import init_logger
-from frozendict import frozendict
 
 
 logger = init_logger(__name__)
-DEFAULT_AUDIO_SAMPLE_RATE = 16000
 
 
 def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
@@ -58,16 +54,19 @@ async def preload(self, request: Request):
                 raise ValueError(f"cannot read audio which type is {self._type}!")
 
             # check if valid audio bytes
-            audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=DEFAULT_AUDIO_SAMPLE_RATE)
+            audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=16000)
             audio_values = np.asarray(audio_values, dtype=np.float32)
+
             from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
 
-            self.audio_length = max(int(audio_values.shape[0]), MIN_AUDIO_LEN)
+            if audio_values.shape[0] < MIN_AUDIO_LEN:
+                audio_values = np.pad(
+                    audio_values, (0, MIN_AUDIO_LEN - audio_values.shape[0]), mode="constant", constant_values=0.0
+                )
+                logger.warning(f"audio length is too short, pad to {MIN_AUDIO_LEN}")
+
+            self.audio_length = int(audio_values.shape[0])
             self._preload_data = audio_values.tobytes()
-            self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
-            self.extra_params["audio_payload_md5"] = (
-                hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
-            )
             return
 
         except Exception as e:

From 4479a6599423cf3a442cb0a937ad89ab07dac8bc Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 03:23:01 +0000
Subject: [PATCH 37/51] fix

---
 .../qwen3_omni_moe_thinker/audio_process.py   | 28 ++++++++-----------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index e9dc931886..194914d455 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -5,6 +5,7 @@
 from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.utils import TensorType
+from functools import lru_cache
 
 
 class WhisperFeatureExtractor(SequenceFeatureExtractor):
@@ -46,32 +47,25 @@ def __init__(
             norm="slaney",
             mel_scale="slaney",
         )
-        self._hann_window_cache = {}
-        self._mel_filters_cache = {}
-
-    def _get_cached_feature_tensors(self, device: Union[str, torch.device]):
-        device_key = str(device)
-        window = self._hann_window_cache.get(device_key)
-        if window is None:
-            window = torch.hann_window(self.n_fft, device=device)
-            self._hann_window_cache[device_key] = window
-
-        mel_filters = self._mel_filters_cache.get(device_key)
-        if mel_filters is None:
-            mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
-            self._mel_filters_cache[device_key] = mel_filters
-        return window, mel_filters
+
+    @lru_cache(maxsize=12)
+    def get_hann_window(self, device: Union[str, torch.device]):
+        return torch.hann_window(self.n_fft, device=device)
+
+    @lru_cache(maxsize=12)
+    def get_mel_filters(self, device: Union[str, torch.device]):
+        return torch.from_numpy(self.mel_filters).to(device, torch.float32)
 
     def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
         waveform = torch.from_numpy(waveform).to(device, torch.float32)
-        window, mel_filters = self._get_cached_feature_tensors(device)
+        window = self.get_hann_window(device)
 
         if self.dither != 0.0:
             waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
 
         stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
         magnitudes = stft[..., :-1].abs() ** 2
-
+        mel_filters = self.get_mel_filters(device)
         mel_spec = mel_filters.T @ magnitudes
 
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()

From be595131895792f57532be06a5988923935fae20 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 05:03:39 +0000
Subject: [PATCH 38/51] fix

---
 lightllm/models/qwen3_omni_moe_thinker/model.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 79ce939714..1b8fa0110d 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -1,7 +1,8 @@
 import os
 import json
 import librosa
-from collections import OrderedDict
+import copy
+from functools import lru_cache
 from io import BytesIO
 from lightllm.common.build_utils import repair_config
 from lightllm.models.registry import ModelRegistry
@@ -31,8 +32,6 @@
 class QWen3OmniTokenizer(QWen3VLTokenizer):
     def __init__(self, tokenizer=None, processor=None, **kwargs):
         self.tokenizer = tokenizer
-        self._prompt_encode_cache = OrderedDict()
-        self._prompt_encode_cache_capacity = 64
         # image
         self.image_processor = processor.image_processor
         self.min_pixel = self.image_processor.min_pixels
@@ -69,16 +68,9 @@ def get_audio_token_length(self, audio: AudioItem):
         # print(f"token_num is {token_num}  n_samples is {self.n_samples} hop_length is {self.hop_length}")
         return token_num
 
+    @lru_cache(maxsize=128)
     def _encode_prompt_text(self, prompt: str):
-        cached_ids = self._prompt_encode_cache.get(prompt)
-        if cached_ids is not None:
-            self._prompt_encode_cache.move_to_end(prompt)
-            return list(cached_ids)
-
         origin_ids = self.tokenizer.encode(prompt)
-        self._prompt_encode_cache[prompt] = tuple(origin_ids)
-        if len(self._prompt_encode_cache) > self._prompt_encode_cache_capacity:
-            self._prompt_encode_cache.popitem(last=False)
         return origin_ids
 
     def _caclu_audio_token_num(self, input_audio_len: int):
@@ -90,6 +82,7 @@ def _caclu_audio_token_num(self, input_audio_len: int):
 
     def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
         origin_ids = self._encode_prompt_text(prompt)
+        origin_ids = copy.deepcopy(origin_ids)
 
         # <img><image_pad></img> -> <img></img>
         origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)]

From 3b0e61353c5eb5017c57fa37c49910e868b8b39e Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 05:19:13 +0000
Subject: [PATCH 39/51] fix

---
 lightllm/server/multimodal_params.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index e45a28db12..6de86fd8b5 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -8,6 +8,7 @@
 from io import BytesIO
 from PIL import Image
 from fastapi import Request
+from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.utils.multimodal_utils import fetch_resource
 from lightllm.utils.log_utils import init_logger
 
@@ -15,13 +16,6 @@
 logger = init_logger(__name__)
 
 
-def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
-    num_samples = int(extra_params.get("audio_num_samples", 0))
-    if num_samples > 0:
-        return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
-    return np.frombuffer(audio_data, dtype=np.float32)
-
-
 class AudioItem:
     def __init__(self, **kwargs):
         self._type = kwargs["type"]
@@ -97,6 +91,12 @@ def to_origin_dict(self):
         ret["data"] = self._data
         return ret
 
+    def load_audio_from_shm_payload(self) -> np.ndarray:
+        audio_data = read_shm(get_shm_name_data(self.uuid))
+        audio_array = np.frombuffer(audio_data, dtype=np.float32)
+        assert audio_array.shape[0] == self.audio_length
+        return audio_array
+
 
 class ImageItem:
     def __init__(self, **kwargs):

From 56af31d4a1354ef29e434355471540da2a95dc5d Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 05:21:17 +0000
Subject: [PATCH 40/51] fix

---
 lightllm/server/multimodal_params.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 6de86fd8b5..6210628751 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -94,7 +94,9 @@ def to_origin_dict(self):
     def load_audio_from_shm_payload(self) -> np.ndarray:
         audio_data = read_shm(get_shm_name_data(self.uuid))
         audio_array = np.frombuffer(audio_data, dtype=np.float32)
-        assert audio_array.shape[0] == self.audio_length
+        if audio_array.shape[0] != self.audio_length:
+            logger.error(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}")
+            assert audio_array.shape[0] == self.audio_length
         return audio_array
 
 

From 4a61198fabbd1e2e116905e5a1333f0b4b9e13ba Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 05:26:15 +0000
Subject: [PATCH 41/51] fix

---
 lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 71fdb3f3b1..03c57126ff 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -9,8 +9,7 @@
 from typing import Callable, Optional, Union, List
 from transformers.activations import ACT2FN
 
-from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
+from lightllm.server.multimodal_params import AudioItem
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
@@ -336,8 +335,8 @@ def encode(self, audio_items: List[AudioItem]):
             if isinstance(item, AudioItem):
                 uuids.append(item.uuid)
                 items.append(item)
-                audio_data = read_shm(get_shm_name_data(item.uuid))
-                audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
+                assert self.processor.sampling_rate == 16000
+                audio = item.load_audio_from_shm_payload()
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 

From ccd4b573e70b8fbbe3af0afffb3cf67caa4c66c1 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 05:30:02 +0000
Subject: [PATCH 42/51] fix

---
 lightllm/models/whisper/whisper_audio.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 4cd9619e55..aaa29e1c71 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -6,8 +6,7 @@
 from typing import List, Union
 from safetensors.torch import load_file
 from transformers.processing_utils import ProcessorMixin
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
-from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
+from lightllm.server.multimodal_params import AudioItem
 
 
 # tokenizer_class removed
@@ -37,7 +36,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
 
     def get_T_after_cnn(self, L_in, dilation=1):
-        for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "):
+        for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
             L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
             L_out = 1 + L_out // stride
             L_in = L_out
@@ -168,8 +167,7 @@ def encode(self, audio_items: List[AudioItem]):
             if isinstance(item, AudioItem):
                 uuids.append(item.uuid)
                 items.append(item)
-                audio_data = read_shm(get_shm_name_data(item.uuid))
-                audio = load_audio_from_shm_payload(audio_data, item.extra_params, 16000)
+                audio = item.load_audio_from_shm_payload()
             else:
                 raise ValueError(f"cannot read audio which type is {type(item)}!")
 
@@ -217,7 +215,9 @@ def encode(self, audio_items: List[AudioItem]):
 
         ans_embeds = []
         for i in range(len(uuids)):
+
             item = items[i]
+
             # 拼接该 audio 的所有 chunk embedding
             cur_embed = torch.cat(per_audio_embeds[i], dim=0)
             ans_embeds.append(cur_embed)

From b7d11876a659fb0e12c5886f8db38c4229f50b76 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 05:49:05 +0000
Subject: [PATCH 43/51] fix

---
 lightllm/server/httpserver/manager.py | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 115be4bd38..07d5936890 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -618,30 +618,16 @@ async def transfer_to_next_module(
                 return
 
             if not self.args.disable_audio:
-                logger.debug(
-                    f"lightllm_req_id:{group_req_objs.group_req_id} "
-                    f"stage:transfer_to_audio "
-                    f"target_port:{self.args.audio_port}"
-                )
                 self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
                 return
 
             if self.args.enable_cpu_cache:
-                logger.debug(
-                    f"lightllm_req_id:{group_req_objs.group_req_id} "
-                    f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}"
-                )
                 self.send_to_multi_level_kv_cache.send_pyobj(
                     group_req_objs.to_group_req_index(),
                     protocol=pickle.HIGHEST_PROTOCOL,
                 )
                 return
 
-            logger.debug(
-                f"lightllm_req_id:{group_req_objs.group_req_id} "
-                f"stage:transfer_to_router "
-                f"target_port:{self.args.router_port}"
-            )
             self.send_to_router.send_pyobj(
                 group_req_objs.to_group_req_index(),
                 protocol=pickle.HIGHEST_PROTOCOL,
@@ -650,11 +636,6 @@ async def transfer_to_next_module(
 
         if self.pd_mode.is_D():
             # 在 D 模式下，不需要传输真的多模态参数，因为其已经被 P 处理好了
-            logger.debug(
-                f"lightllm_req_id:{group_req_objs.group_req_id} "
-                f"stage:transfer_to_router_from_decode "
-                f"target_port:{self.args.router_port}"
-            )
             self.send_to_router.send_pyobj(
                 group_req_objs.to_group_req_index(),
                 protocol=pickle.HIGHEST_PROTOCOL,
@@ -673,6 +654,7 @@ async def _wait_to_token_package(
         req_status: "ReqStatus",
         request: Request,
     ):
+
         event = req_status.event
         unfinished_count = sampling_params.best_of
         out_token_counter = 0
@@ -715,11 +697,6 @@ async def _wait_to_token_package(
                         first_token_cost_ms = (time.time() - start_time) * 1000
                         is_first_token = False
                         self.first_time_costs.add(first_token_cost_ms)
-                        logger.info(
-                            f"lightllm_req_id:{group_request_id} "
-                            f"stage:first_token_arrived elapsed_ms:{first_token_cost_ms:.3f} "
-                            f"sub_req_id:{sub_req_id} prompt_tokens:{prompt_tokens}"
-                        )
 
                     out_token_counter += 1
 
@@ -803,6 +780,7 @@ async def recycle_resource_loop(self):
         pre_time_mark = time.time()
 
         while True:
+
             try:
                 await asyncio.wait_for(self.recycle_event.wait(), timeout=0.02)
             except asyncio.TimeoutError:
@@ -879,6 +857,7 @@ async def handle_loop(self):
 
                         for _ in range(read_token_count):
                             if not req.out_tokens_queue.is_empty():
+
                                 text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
                                 req.cumlogprob += float(req.shm_logprobs.arr[src_index])
                                 metadata = {

From 40cd0b9882160db09723fe5357832f42908af619 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 06:08:28 +0000
Subject: [PATCH 44/51] fix

---
 lightllm/server/httpserver/manager.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 07d5936890..8b7dafeffe 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -301,8 +301,6 @@ async def generate(
         start_time = time.time()
         request_headers = request.headers if request is not None else {}
         group_request_id = self.alloc_req_id(sampling_params, is_health_req)
-        if request is not None:
-            request.state.lightllm_req_id = group_request_id
         audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
         image_count = len(multimodal_params.images) if multimodal_params is not None else 0
         self._log_stage_timing(

From 284815fb33022f6c5b6fda5679c8e4508dd70c66 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 06:09:38 +0000
Subject: [PATCH 45/51] fix

---
 lightllm/server/httpserver/manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8b7dafeffe..45193e928b 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -307,8 +307,8 @@ async def generate(
             group_request_id,
             start_time,
             "received",
-            has_audio=audio_count > 0,
-            has_image=image_count > 0,
+            audio_count=audio_count,
+            image_count=image_count,
         )
 
         try:

From fa11c53cde2dc7d60503c77141718b3d871a1c40 Mon Sep 17 00:00:00 2001
From: wangzaijun <wzjhelloworld@qq.com>
Date: Wed, 8 Apr 2026 06:22:24 +0000
Subject: [PATCH 46/51] fix

---
 lightllm/server/httpserver/manager.py | 31 +++------------------------
 1 file changed, 3 insertions(+), 28 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 45193e928b..610931784c 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -322,8 +322,6 @@ async def generate(
                     group_request_id,
                     start_time,
                     "verify_and_preload_done",
-                    audio_count=audio_count,
-                    image_count=image_count,
                 )
 
             # 记录请求到达的相关信息
@@ -334,9 +332,6 @@ async def generate(
                 group_request_id,
                 start_time,
                 "encode_done",
-                prompt_tokens=len(prompt_ids),
-                audio_count=audio_count,
-                image_count=image_count,
             )
 
             prompt_tokens = len(prompt_ids)
@@ -350,8 +345,6 @@ async def generate(
                 group_request_id,
                 start_time,
                 "check_and_repair_length_done",
-                prompt_tokens=len(prompt_ids),
-                max_new_tokens=sampling_params.max_new_tokens,
             )
 
             if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP():
@@ -404,7 +397,6 @@ async def generate(
                 group_request_id,
                 start_time,
                 "shm_req_init_done",
-                req_count=len(req_objs),
             )
 
             logger.debug(
@@ -423,8 +415,6 @@ async def generate(
                 group_request_id,
                 start_time,
                 "request_forwarded",
-                has_audio=audio_count > 0,
-                has_image=image_count > 0,
             )
 
             results_generator = self._wait_to_token_package(
@@ -481,6 +471,7 @@ def _count_multimodal_tokens(self, multimodal_params: MultimodalParams) -> Tuple
         return image_tokens, audio_tokens
 
     async def _log_req_header(self, request_headers, group_request_id: int):
+
         x_request_id = request_headers.get("X-Request-Id", "")
         x_session_id = request_headers.get("X-Session-Id", "")
 
@@ -493,11 +484,7 @@ async def _log_req_header(self, request_headers, group_request_id: int):
         return
 
     async def _encode(
-        self,
-        prompt: Union[str, List[int]],
-        multimodal_params: MultimodalParams,
-        sampling_params: SamplingParams,
-        start_time: Optional[float] = None,
+        self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams
     ):
         if isinstance(prompt, str):
             if self.enable_multimodal:
@@ -507,14 +494,6 @@ async def _encode(
                 if multimodal_params.audios:
                     assert not self.args.disable_audio, "audio multimodal not enabled"
                 await self._alloc_multimodal_resources(multimodal_params, sampling_params)
-                log_req_id = getattr(sampling_params, "group_request_id", None)
-                self._log_stage_timing(
-                    log_req_id,
-                    start_time,
-                    "alloc_multimodal_resources_done",
-                    audio_count=len(multimodal_params.audios),
-                    image_count=len(multimodal_params.images),
-                )
                 prompt_ids = self.tokenizer.encode(
                     prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
                 )
@@ -605,13 +584,9 @@ async def transfer_to_next_module(
         self,
         group_req_objs: Optional[GroupReqObjs] = None,
     ):
+
         if self.pd_mode.is_P_or_NORMAL():
             if not self.args.disable_vision:
-                logger.debug(
-                    f"lightllm_req_id:{group_req_objs.group_req_id} "
-                    f"stage:transfer_to_visual "
-                    f"target_port:{self.args.visual_port}"
-                )
                 self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
                 return
 

From 44c63d97eb26ac5ef3e946fa6e98acc15ae4fa14 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Wed, 8 Apr 2026 07:34:56 +0000
Subject: [PATCH 47/51] fix: stop passing the removed start_time argument to _encode

---
 lightllm/server/httpserver/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 610931784c..c9822ff618 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -327,7 +327,7 @@ async def generate(
             # 记录请求到达的相关信息
             await self._log_req_header(request_headers, group_request_id)
             # encode
-            prompt_ids = await self._encode(prompt, multimodal_params, sampling_params, start_time=start_time)
+            prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
             self._log_stage_timing(
                 group_request_id,
                 start_time,

From c5cc9952105dd955b04f1ced509ce1294b7227c3 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Wed, 8 Apr 2026 09:09:12 +0000
Subject: [PATCH 48/51] support long audio

---
 lightllm/models/qwen3_omni_moe_thinker/audio_process.py | 2 +-
 lightllm/models/qwen3_omni_moe_thinker/model.py         | 7 +------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 194914d455..58b223d579 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -102,7 +102,7 @@ def zero_mean_unit_var_norm(
     def _preprocess(
         self,
         raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
-        truncation: bool = True,
+        truncation: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_attention_mask: Optional[bool] = None,
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 1b8fa0110d..bee15e3d2a 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -59,12 +59,7 @@ def init_audioitem_extral_params(
         return
 
     def get_audio_token_length(self, audio: AudioItem):
-        # 这里得处理对应奖语音长度按照 30 进行限制，后续处理中，超过30的会被截断。
-        if audio.audio_length > self.n_samples:
-            logger.warning(f"audio length {audio.audio_length} exceed max length {self.n_samples}, will be truncated.")
-
-        length = min(audio.audio_length, int(self.n_samples))
-        token_num = self._caclu_audio_token_num(length)
+        token_num = self._caclu_audio_token_num(audio.audio_length)
         # print(f"token_num is {token_num}  n_samples is {self.n_samples} hop_length is {self.hop_length}")
         return token_num
 

From eb4558a906da5c4c8aa9ab9f8e90d18382f70cc1 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 9 Apr 2026 04:45:20 +0000
Subject: [PATCH 49/51] add check_long_audio_infer

---
 .../qwen3_omni_audio.py                       | 23 +++++++++++++++++++
 lightllm/models/whisper/whisper_audio.py      |  3 +++
 .../audioserver/model_infer/model_rpc.py      |  1 +
 3 files changed, 27 insertions(+)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 03c57126ff..9fb4e1d1db 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -10,10 +10,13 @@
 from transformers.activations import ACT2FN
 
 from lightllm.server.multimodal_params import AudioItem
+from lightllm.utils.log_utils import init_logger
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
 
+logger = init_logger(__name__)
+
 
 def _get_feat_extract_output_lengths(input_lengths):
     """
@@ -259,6 +262,7 @@ def load_model(self, weight_dir, config):
 
         self.load_state_dict(weight_dict)
 
+    @torch.inference_mode()
     def forward(
         self,
         input_features,
@@ -327,6 +331,7 @@ def forward(
         hidden_states = self.proj2(hidden_states)
         return hidden_states
 
+    @torch.inference_mode()
     def encode(self, audio_items: List[AudioItem]):
         uuids = []
         items: List[AudioItem] = []
@@ -363,3 +368,21 @@ def encode(self, audio_items: List[AudioItem]):
             all_embeds.append(cur_embed)
 
         return all_embeds, audio_items
+
+    @torch.inference_mode()
+    def check_long_audio_infer(self):
+        """Exercise forward with mel length chosen so the conv loop runs once with batch dim == conv_chunksize."""
+        device = next(self.parameters()).device
+        frame_len = self.conv_chunksize * (self.n_window * 2)
+        logger.info(
+            "check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s",
+            frame_len,
+            self.conv_chunksize,
+            self.n_window,
+            device,
+            self.data_type,
+        )
+        input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=self.data_type)
+        feature_lens = torch.tensor([frame_len], device=device, dtype=torch.long)
+        out = self.forward(input_features, feature_lens=feature_lens)
+        logger.info("check_long_audio_infer: done output_shape=%s", tuple(out.shape))
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index aaa29e1c71..8a984d29a5 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -223,3 +223,6 @@ def encode(self, audio_items: List[AudioItem]):
             ans_embeds.append(cur_embed)
 
         return ans_embeds, audio_items
+
+    def check_long_audio_infer(self):
+        pass
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 39a7e06ac3..82919856d9 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -51,6 +51,7 @@ def exposed_init_model(self, kvargs):
 
             self.model.load_model(weight_dir, model_cfg)
             self.model = self.model.cuda()
+            self.model.check_long_audio_infer()
 
             self.cache_client = rpyc.connect("localhost", self.cache_port, config={"allow_pickle": True})
             self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

From 94ffee1fab412071a93dfc69abaeeb3d0fd43356 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 9 Apr 2026 07:05:03 +0000
Subject: [PATCH 50/51] add LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE

---
 lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 9fb4e1d1db..c81e1d5859 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -15,6 +15,8 @@
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
 
+QWEN3_OMNI_CONV_CHUNKSIZE = int(os.getenv("LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE", 500))
+
 logger = init_logger(__name__)
 
 
@@ -159,7 +161,7 @@ def __init__(
         activation_function="gelu",
         output_dim=2048,
         n_window_infer=800,
-        conv_chunksize=500,
+        conv_chunksize=QWEN3_OMNI_CONV_CHUNKSIZE,
         encoder_attention_heads=20,
         attention_dropout=0,
         activation_dropout=0,

From 0553276514fff3eef059306b917c8a9f6084dced Mon Sep 17 00:00:00 2001
From: wanzihao <1060304770@qq.com>
Date: Thu, 9 Apr 2026 15:14:34 +0800
Subject: [PATCH 51/51] Apply suggestions from code review. Use params.dtype

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../models/qwen3_omni_moe_thinker/qwen3_omni_audio.py     | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c81e1d5859..ff49ab160a 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -374,7 +374,9 @@ def encode(self, audio_items: List[AudioItem]):
     @torch.inference_mode()
     def check_long_audio_infer(self):
         """Exercise forward with mel length chosen so the conv loop runs once with batch dim == conv_chunksize."""
-        device = next(self.parameters()).device
+        params = next(self.parameters())
+        device = params.device
+        dtype = params.dtype
         frame_len = self.conv_chunksize * (self.n_window * 2)
         logger.info(
             "check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s",
@@ -382,9 +384,9 @@ def check_long_audio_infer(self):
             self.conv_chunksize,
             self.n_window,
             device,
-            self.data_type,
+            dtype,
         )
-        input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=self.data_type)
+        input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=dtype)
         feature_lens = torch.tensor([frame_len], device=device, dtype=torch.long)
         out = self.forward(input_features, feature_lens=feature_lens)
         logger.info("check_long_audio_infer: done output_shape=%s", tuple(out.shape))