From c8b388845b80335e532f930ff09e8be41d050eaa Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 26 Mar 2026 08:17:31 +0000
Subject: [PATCH 01/51] qwen3_vl_moe: support prefill cudagraph
---
.../layer_infer/transformer_layer_infer.py | 39 ++++++++++++++++++-
1 file changed, 37 insertions(+), 2 deletions(-)
diff --git a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
index 391ee8bf6b..40d4bbc0ad 100644
--- a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
+++ b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
@@ -1,12 +1,14 @@
import torch
import torch.distributed as dist
from typing import Tuple
+from lightllm.common.basemodel.infer_struct import InferStateInfo
from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton_fused
from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer
from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
from lightllm.models.qwen3_vl.infer_struct import Qwen3VLInferStateInfo
from lightllm.distributed import all_reduce
from lightllm.models.qwen3_vl.triton_kernel.deepstack_multimodal_emb import apply_deepstack_features
+from lightllm.utils.tensor_utils import tensor_to_no_ref_tensor
class Qwen3VLMOETransformerLayerInfer(Qwen3MOETransformerLayerInfer):
@@ -48,7 +50,7 @@ def context_forward(self, input_embdings, infer_state: Qwen3VLInferStateInfo, la
q, cache_kv = self._get_qkv(input1, infer_state, layer_weight)
input1 = None
self._post_cache_kv(cache_kv, infer_state, layer_weight)
- o = self._context_attention_kernel(q, cache_kv, infer_state, layer_weight)
+ o = self._context_attention_wrapper_run(q, cache_kv, infer_state, layer_weight)
q = None
o = self._get_o(o, infer_state, layer_weight)
if self.tp_world_size_ > 1:
@@ -62,9 +64,42 @@ def context_forward(self, input_embdings, infer_state: Qwen3VLInferStateInfo, la
if self.tp_world_size_ > 1:
all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False)
input_embdings.add_(ffn_out.view(-1, self.embed_dim_))
- apply_deepstack_features(
+ self._apply_deepstack_features_wrapper_run(
input_embeddings=input_embdings,
infer_state=infer_state,
layer_num=self.layer_num_,
)
return input_embdings
+
+ def _apply_deepstack_features_wrapper_run(
+ self,
+ input_embeddings: torch.Tensor,
+ infer_state: InferStateInfo,
+ layer_num: int,
+ ):
+ if torch.cuda.is_current_stream_capturing():
+ input_embeddings = input_embeddings.contiguous()
+ _input_embeddings = tensor_to_no_ref_tensor(input_embeddings)
+ pre_capture_graph = infer_state.prefill_cuda_graph_get_current_capture_graph()
+ pre_capture_graph.__exit__(None, None, None)
+
+ infer_state.prefill_cuda_graph_create_graph_obj()
+ infer_state.prefill_cuda_graph_get_current_capture_graph().__enter__()
+
+ def apply_func(new_infer_state: InferStateInfo):
+ apply_deepstack_features(
+ input_embeddings=_input_embeddings,
+ infer_state=new_infer_state,
+ layer_num=layer_num,
+ )
+ return
+
+ infer_state.prefill_cuda_graph_add_cpu_runnning_func(func=apply_func, after_graph=pre_capture_graph)
+ else:
+ apply_deepstack_features(
+ input_embeddings=input_embeddings,
+ infer_state=infer_state,
+ layer_num=layer_num,
+ )
+
+ return
From e7fba3af30bb723dcf9909a6d06bbb9ff514134b Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 27 Mar 2026 05:17:57 +0000
Subject: [PATCH 02/51] Add data parallelism for the audio encoder (audio_dp)
---
lightllm/server/api_cli.py | 4 ++
lightllm/server/api_start.py | 21 +++++-
lightllm/server/audioserver/manager.py | 41 ++++++-----
.../audioserver/model_infer/model_rpc.py | 68 ++++++++++++++-----
lightllm/server/core/objs/start_args_type.py | 6 +-
5 files changed, 105 insertions(+), 35 deletions(-)
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index d32da8097c..776fbc8247 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -426,6 +426,9 @@ def make_argument_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch"
)
+ parser.add_argument(
+ "--audio_infer_batch_size", type=int, default=None, help="number of audios to process in each inference batch"
+ )
parser.add_argument(
"--visual_send_batch_size",
type=int,
@@ -440,6 +443,7 @@ def make_argument_parser() -> argparse.ArgumentParser:
)
parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT")
parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT")
+ parser.add_argument("--audio_dp", type=int, default=1, help="number of data parallel instances for audio encoder")
parser.add_argument(
"--visual_nccl_ports",
nargs="+",
diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py
index 364f9ca281..180b16e658 100644
--- a/lightllm/server/api_start.py
+++ b/lightllm/server/api_start.py
@@ -188,6 +188,9 @@ def normal_or_p_d_start(args):
if args.visual_dp <= 0:
raise ValueError("visual_dp must be a positive integer.")
+ if args.audio_dp <= 0:
+ raise ValueError("audio_dp must be a positive integer.")
+
if args.visual_infer_batch_size is None:
args.visual_infer_batch_size = args.visual_dp
@@ -198,6 +201,15 @@ def normal_or_p_d_start(args):
f"a positive integer multiple of visual_dp ({args.visual_dp})"
)
+ if args.audio_infer_batch_size is None:
+ args.audio_infer_batch_size = args.audio_dp * 2
+
+ if args.audio_infer_batch_size // args.audio_dp < 1 or args.audio_infer_batch_size % args.audio_dp != 0:
+ raise ValueError(
+ f"audio_infer_batch_size ({args.audio_infer_batch_size}) must be "
+ f"a positive integer multiple of audio_dp ({args.audio_dp})"
+ )
+
if args.disable_chunked_prefill:
args.chunked_prefill_size = args.max_req_total_len
# 普通模式下
@@ -247,8 +259,10 @@ def normal_or_p_d_start(args):
ports_locker.lock_port()
node_world_size = args.tp // args.nnodes
+ audio_model_dp_ports_num = 0 if args.disable_audio else args.audio_dp
can_use_ports = alloc_can_use_network_port(
- num=10 + node_world_size + args.visual_dp * (args.visual_tp + 1), used_ports=already_uesd_ports
+ num=10 + node_world_size + args.visual_dp * (args.visual_tp + 1) + audio_model_dp_ports_num,
+ used_ports=already_uesd_ports,
)
logger.info(f"alloced ports: {can_use_ports}")
(
@@ -274,6 +288,9 @@ def normal_or_p_d_start(args):
visual_nccl_ports.append(can_use_ports[0])
can_use_ports = can_use_ports[1:]
+ audio_model_dp_ports = can_use_ports[0:audio_model_dp_ports_num]
+ can_use_ports = can_use_ports[audio_model_dp_ports_num:]
+
# 将申请好的端口放入args参数中
if args.nccl_port is None:
args.nccl_port = nccl_port
@@ -342,7 +359,7 @@ def normal_or_p_d_start(args):
start_audio_process,
],
start_args=[
- (args,),
+ (args, audio_model_dp_ports),
],
)
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index bb0a745302..f7cb300aaf 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -26,7 +26,7 @@ class AudioManager:
def __init__(
self,
args: StartArgs,
- infer_batch_size=4,
+ audio_model_rpc_ports,
):
context = zmq.asyncio.Context(2)
@@ -45,29 +45,32 @@ def __init__(
self.waiting_reqs: List[GroupReqIndexes] = []
self.model_weightdir = args.model_dir
self.tp_world_size = args.tp
- self.world_size = 1
- self.infer_batch_size = infer_batch_size
+ self.audio_dp = args.audio_dp
+ self.infer_batch_size = args.audio_infer_batch_size
self.trust_remote_code = args.trust_remote_code
self.args = args
+ self.audio_model_rpc_ports = audio_model_rpc_ports or [None] * self.audio_dp
self.shm_req_manager = ShmReqManager()
+ self.model_rpcs: List[AudioModelRpcClient] = []
async def wait_to_model_ready(self):
-
- self.model_rpcs: List[AudioModelRpcClient] = []
- for rank_id in range(self.world_size):
- rpc_model = await start_model_process(world_size=self.world_size)
+ self.model_rpcs = []
+ for dp_rank_id in range(self.audio_dp):
+ rpc_model = await start_model_process(
+ world_size=self.audio_dp, port=self.audio_model_rpc_ports[dp_rank_id], device_id=dp_rank_id
+ )
self.model_rpcs.append(rpc_model)
init_model_ret = []
- for rank_id in range(self.world_size):
+ for dp_rank_id in range(self.audio_dp):
kvargs = {
"weight_dir": self.model_weightdir,
"trust_remote_code": self.trust_remote_code,
- "rank_id": rank_id,
+ "dp_rank_id": dp_rank_id,
"cache_port": self.cache_port,
"data_type": self.args.data_type,
}
- init_model_ret.append(self.model_rpcs[rank_id].init_model(kvargs))
+ init_model_ret.append(self.model_rpcs[dp_rank_id].init_model(kvargs))
await asyncio.gather(*init_model_ret)
return
@@ -75,7 +78,11 @@ async def infer_audios(self, audios: List[AudioItem]):
if len(audios) == 0:
return
- rets = [self.model_rpcs[tp_rank].encode(audios) for tp_rank in range(self.world_size)]
+ rets = []
+ for dp_rank_id in range(self.audio_dp):
+ assigned_audios = [audios[i] for i in range(dp_rank_id, len(audios), self.audio_dp)]
+ if assigned_audios:
+ rets.append(self.model_rpcs[dp_rank_id].encode(assigned_audios))
await asyncio.gather(*rets)
return
@@ -148,19 +155,21 @@ async def loop_for_netio_req(self):
def clean_up(self):
for model_rpc in self.model_rpcs:
- model_rpc.rpc_server_process.kill()
+ if model_rpc.rpc_server_process is not None:
+ model_rpc.rpc_server_process.kill()
for model_rpc in self.model_rpcs:
- model_rpc.rpc_server_process.join()
+ if model_rpc.rpc_server_process is not None:
+ model_rpc.rpc_server_process.join()
return
-def start_audio_process(args, pipe_writer):
+def start_audio_process(args, model_rpc_ports, pipe_writer):
# 注册graceful 退出的处理
graceful_registry(inspect.currentframe().f_code.co_name)
setproctitle.setproctitle(f"lightllm::{get_unique_server_name()}::audio_server")
+ audioserver = AudioManager(args=args, audio_model_rpc_ports=model_rpc_ports)
try:
- audioserver = AudioManager(args=args)
asyncio.run(audioserver.wait_to_model_ready())
except Exception as e:
logger.exception(str(e))
@@ -170,7 +179,7 @@ def start_audio_process(args, pipe_writer):
pipe_writer.send("init ok")
def handle_exception(loop, context):
- logger.exception(f"VisualServer Caught exception: {str(context)}")
+ logger.exception(f"AudioServer Caught exception: {str(context)}")
loop = asyncio.new_event_loop()
loop.set_exception_handler(handle_exception)
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index a8a2c39c3e..cbd39666a0 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -1,22 +1,25 @@
import asyncio
import rpyc
+import socket
import torch
-from typing import Dict, List, Tuple
+import inspect
+from typing import List
+from rpyc.utils.classic import obtain
+from rpyc.utils.server import ThreadedServer
from transformers.configuration_utils import PretrainedConfig
from lightllm.models.whisper.whisper_audio import WhisperAudioModel
from lightllm.models.qwen3_omni_moe_thinker.qwen3_omni_audio import Qwen3OmniMoeAudioEncoder
from lightllm.server.multimodal_params import AudioItem
from lightllm.utils.infer_utils import set_random_seed
from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
+from lightllm.utils.graceful_utils import graceful_registry
class AudioModelRpcServer(rpyc.Service):
def exposed_init_model(self, kvargs):
- # 注册graceful 退出的处理
- from lightllm.utils.graceful_utils import graceful_registry
- import inspect
-
- graceful_registry(inspect.currentframe().f_code.co_name)
+ kvargs = obtain(kvargs)
+ self.dp_rank_id = kvargs["dp_rank_id"]
+ torch.cuda.set_device(self.dp_rank_id)
weight_dir = kvargs["weight_dir"]
model_cfg, _ = PretrainedConfig.get_config_dict(weight_dir)
@@ -41,7 +44,7 @@ def exposed_init_model(self, kvargs):
# CpuEmbedCacheClient 的初始化需要依赖这个设置的环境信息。
from lightllm.utils.dist_utils import set_current_device_id
- set_current_device_id(torch.cuda.current_device())
+ set_current_device_id(self.dp_rank_id)
self.cpu_embed_cache_client = CpuEmbedCacheClient(
create_meta_data=False,
@@ -65,6 +68,8 @@ def forward(self, audios):
# @calculate_time(show=False, min_cost_ms=300)
def exposed_encode(self, audios):
+ torch.cuda.set_device(self.dp_rank_id)
+ audios = obtain(audios)
return self.forward(audios)
@@ -74,6 +79,7 @@ def __init__(self, model_rpc, world_size, rpc_server_process=None):
self.world_size = world_size
self.rpc_server_process = rpc_server_process
self.use_rpc = self.world_size != 1
+
if self.use_rpc:
def async_wrap(f):
@@ -82,7 +88,6 @@ def async_wrap(f):
async def _func(*args, **kwargs):
ans = f(*args, **kwargs)
await asyncio.to_thread(ans.wait)
- # raise if exception
return ans.value
return _func
@@ -95,21 +100,52 @@ async def _func(*args, **kwargs):
return
async def init_model(self, kvargs):
- ans: rpyc.AsyncResult = self._init_model(kvargs)
+ ans = self._init_model(kvargs)
if self.use_rpc:
- await ans
- return
- else:
- return
+ return await ans
+ return ans
async def encode(self, audios: List[AudioItem]):
ans = self._encode(audios)
if self.use_rpc:
return await ans
- else:
- return ans
+ return ans
+
+def _init_env(port, device_id):
+ graceful_registry(inspect.currentframe().f_code.co_name)
+ torch.cuda.set_device(device_id)
-async def start_model_process(world_size):
+ from lightllm.utils.dist_utils import set_current_device_id
+ import lightllm.utils.rpyc_fix_utils as _
+
+ set_current_device_id(device_id)
+ t = ThreadedServer(AudioModelRpcServer(), port=port, protocol_config={"allow_pickle": True})
+ t.start()
+ return
+
+
+async def start_model_process(world_size, port=None, device_id=None):
if world_size == 1:
return AudioModelRpcClient(AudioModelRpcServer(), world_size)
+
+ import multiprocessing
+
+ proc = multiprocessing.Process(target=_init_env, args=(port, device_id))
+ proc.start()
+ await asyncio.sleep(2)
+ repeat_count = 0
+ while repeat_count < 20:
+ try:
+ con = rpyc.connect("localhost", port, config={"allow_pickle": True})
+ con._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
+ break
+ except BaseException:
+ await asyncio.sleep(1)
+ repeat_count += 1
+
+ if repeat_count == 20:
+ raise Exception("init rpc env error!")
+
+ assert proc.is_alive()
+ return AudioModelRpcClient(con.root, world_size, rpc_server_process=proc)
diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
index 37c022f3a3..8411a14e3c 100644
--- a/lightllm/server/core/objs/start_args_type.py
+++ b/lightllm/server/core/objs/start_args_type.py
@@ -99,10 +99,12 @@ class StartArgs:
grouping_key: List[str] = field(default_factory=list)
push_interval: int = field(default=10)
visual_infer_batch_size: int = field(default=None)
+ audio_infer_batch_size: int = field(default=None)
visual_send_batch_size: int = field(default=1)
visual_gpu_ids: List[int] = field(default_factory=lambda: [0])
visual_tp: int = field(default=1)
visual_dp: int = field(default=1)
+ audio_dp: int = field(default=1)
visual_nccl_ports: List[int] = field(default=None)
enable_monitor_auth: bool = field(default=False)
disable_cudagraph: bool = field(default=False)
@@ -125,7 +127,9 @@ class StartArgs:
vit_att_backend: List[str] = field(
default=("auto",), metadata={"choices": ["auto", "triton", "fa3", "sdpa", "xformers"]}
)
- llm_kv_type: str = field(default="None", metadata={"choices": ["None", "int8kv", "int4kv", "fp8kv_sph", "fp8kv_spt"]})
+ llm_kv_type: str = field(
+ default="None", metadata={"choices": ["None", "int8kv", "int4kv", "fp8kv_sph", "fp8kv_spt"]}
+ )
llm_kv_quant_group_size: int = field(default=8)
sampling_backend: str = field(default="triton", metadata={"choices": ["triton", "sglang_kernel"]})
penalty_counter_mode: str = field(
From 671b5aa446b970c575c0f02ebb36d60f091e9ba8 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 27 Mar 2026 14:34:57 +0000
Subject: [PATCH 03/51] Add startup warmups for HTTP audio preload and per-rank
 audio workers to remove first-request audio cold-start latency.
---
.../qwen3_omni_audio.py | 19 ++++
lightllm/models/whisper/whisper_audio.py | 18 ++++
lightllm/server/api_http.py | 7 +-
lightllm/server/audioserver/manager.py | 101 +++++++++++++++++-
.../audioserver/model_infer/model_rpc.py | 38 ++++++-
lightllm/server/httpserver/manager.py | 83 ++++++++++++++
lightllm/server/multimodal_params.py | 35 ++++++
7 files changed, 297 insertions(+), 4 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 3573ecde86..6c620448b9 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -389,3 +389,22 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
if ids_to_set:
self.cache_client.root.set_items_embed(ids=ids_to_set)
torch.cuda.current_stream().synchronize()
+
+ @torch.no_grad()
+ def warmup(self, audio_bytes: bytes):
+ audio = BytesIO(audio_bytes)
+ audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
+ input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
+ if feature_attention_mask is not None:
+ audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+ input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
+ else:
+ audio_feature_lengths = None
+
+ feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+ _ = self.forward(
+ input_features,
+ feature_lens=feature_lens,
+ )
+ torch.cuda.current_stream().synchronize()
+ return
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 36c9408cb8..0493afdb9a 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -241,3 +241,21 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
if ids_to_set:
self.cache_client.root.set_items_embed(ids=ids_to_set)
torch.cuda.current_stream().synchronize()
+
+ @torch.no_grad()
+ def warmup(self, audio_bytes: bytes):
+ audio = BytesIO(audio_bytes)
+ audio, _ = librosa.load(audio, sr=16000)
+
+ from .defaults import MIN_AUDIO_LEN
+
+ if audio.shape[0] < MIN_AUDIO_LEN:
+ audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0)
+
+ batch_audio_lens = np.array([min(audio.shape[0], self.max_length)], dtype=np.int32)
+ audios, audio_lens_after_cnn = self.audio_processor(
+ [audio], batch_audio_lens, sampling_rate=16000, return_tensors="pt"
+ )
+ _ = self.forward(audios, audio_lens_after_cnn)
+ torch.cuda.current_stream().synchronize()
+ return
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 230da5b369..6be738befc 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -40,7 +40,7 @@
from fastapi.responses import Response, StreamingResponse, JSONResponse
from lightllm.server.core.objs.sampling_params import SamplingParams
from lightllm.server.core.objs import StartArgs
-from .multimodal_params import MultimodalParams
+from .multimodal_params import MultimodalParams, warmup_audio_preload
from .httpserver.manager import HttpServerManager
from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster
from .api_lightllm import lightllm_get_score
@@ -359,6 +359,11 @@ async def startup_event():
logger.info("server start up")
loop = asyncio.get_event_loop()
g_objs.set_args(get_env_start_args())
+ if g_objs.args.enable_multimodal and not g_objs.args.disable_audio:
+ warmup_start = time.time()
+ logger.info("http_audio_preload_warmup_start")
+ await warmup_audio_preload()
+ logger.info(f"http_audio_preload_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}")
loop.create_task(g_objs.httpserver_manager.handle_loop())
logger.info(f"server start up ok, loop use is {asyncio.get_event_loop()}")
return
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index f7cb300aaf..b4fb002965 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -7,7 +7,8 @@
import socket
import inspect
import setproctitle
-from typing import List
+import time
+from typing import Dict, List
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
from lightllm.utils.log_utils import init_logger
@@ -52,6 +53,27 @@ def __init__(
self.audio_model_rpc_ports = audio_model_rpc_ports or [None] * self.audio_dp
self.shm_req_manager = ShmReqManager()
self.model_rpcs: List[AudioModelRpcClient] = []
+ self.req_stage_times: Dict[int, Dict[str, float]] = {}
+ self.next_module_port = args.multi_level_kv_cache_port if args.enable_cpu_cache else args.router_port
+
+ def _mark_req_stage(self, req_id: int, stage: str):
+ now = time.time()
+ req_stage_dict = self.req_stage_times.setdefault(req_id, {})
+ if "audio_recv" not in req_stage_dict:
+ req_stage_dict["audio_recv"] = now
+ req_stage_dict[stage] = now
+ return now - req_stage_dict["audio_recv"]
+
+ def _log_req_stage(self, req_id: int, stage: str, **kwargs):
+ elapsed_s = self._mark_req_stage(req_id, stage)
+ extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+ suffix = f" {extras}" if extras else ""
+ logger.info(f"lightllm_req_id:{req_id} stage:{stage} elapsed_ms:{elapsed_s * 1000.0:.3f}{suffix}")
+ return
+
+ def _cleanup_req_stage(self, req_id: int):
+ self.req_stage_times.pop(req_id, None)
+ return
async def wait_to_model_ready(self):
self.model_rpcs = []
@@ -72,18 +94,37 @@ async def wait_to_model_ready(self):
}
init_model_ret.append(self.model_rpcs[dp_rank_id].init_model(kvargs))
await asyncio.gather(*init_model_ret)
+
+ warmup_start = time.time()
+ logger.info(f"audio_warmup_start audio_dp:{self.audio_dp}")
+
+ async def warmup_one_rank(dp_rank_id: int):
+ rank_start = time.time()
+ logger.info(f"audio_warmup_rank_start dp_rank_id:{dp_rank_id}")
+ await self.model_rpcs[dp_rank_id].warmup_model()
+ logger.info(
+ f"audio_warmup_rank_done dp_rank_id:{dp_rank_id} elapsed_ms:{(time.time() - rank_start) * 1000.0:.3f}"
+ )
+
+ await asyncio.gather(*[warmup_one_rank(dp_rank_id) for dp_rank_id in range(self.audio_dp)])
+ logger.info(f"audio_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}")
return
async def infer_audios(self, audios: List[AudioItem]):
if len(audios) == 0:
return
+ infer_start = time.time()
rets = []
for dp_rank_id in range(self.audio_dp):
assigned_audios = [audios[i] for i in range(dp_rank_id, len(audios), self.audio_dp)]
if assigned_audios:
rets.append(self.model_rpcs[dp_rank_id].encode(assigned_audios))
await asyncio.gather(*rets)
+ logger.info(
+ f"audio_infer_batch_done audio_count:{len(audios)} audio_dp:{self.audio_dp} "
+ f"elapsed_ms:{(time.time() - infer_start) * 1000.0:.3f}"
+ )
return
@@ -96,6 +137,11 @@ async def loop_for_fwd(self):
audios_need_infer = []
while len(self.waiting_reqs) > 0:
group_req_indexes = self.waiting_reqs.pop(0)
+ self._log_req_stage(
+ group_req_indexes.group_req_id,
+ "audio_queue_picked",
+ waiting_queue_size=len(self.waiting_reqs),
+ )
shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0])
disable_prompt_cache = shm_req.sample_params.disable_prompt_cache
is_aborted = shm_req.is_aborted
@@ -105,6 +151,7 @@ async def loop_for_fwd(self):
# 因为采用 shm 来映射所有的 req 对象以后,引用管理情况复杂了
# 需要一些一致的流程来保证不出现异步问题。
self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+ self._cleanup_req_stage(group_req_indexes.group_req_id)
continue
multimodal_params = group_req_indexes.multimodal_params
@@ -116,28 +163,74 @@ async def loop_for_fwd(self):
else:
ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
+ current_req_has_pending_audio = False
for audio, ready in zip(multimodal_params.audios, ready_audio):
if not ready:
audios_need_infer.append(audio)
+ current_req_has_pending_audio = True
if len(audios_need_infer) == self.infer_batch_size:
+ batch_reqs = processing_group_reqs + (
+ [group_req_indexes] if current_req_has_pending_audio else []
+ )
+ batch_req_ids = [req.group_req_id for req in batch_reqs]
+ logger.info(
+ f"audio_batch_ready req_ids:{batch_req_ids} "
+ f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
+ )
+ for batch_req_id in batch_req_ids:
+ self._log_req_stage(
+ batch_req_id, "audio_infer_start", batch_audio_count=len(audios_need_infer)
+ )
await self.infer_audios(audios_need_infer)
+ for batch_req_id in batch_req_ids:
+ self._log_req_stage(
+ batch_req_id, "audio_infer_done", batch_audio_count=len(audios_need_infer)
+ )
audios_need_infer = []
for _group_req_indexes in processing_group_reqs:
+ self._log_req_stage(
+ _group_req_indexes.group_req_id,
+ "audio_send_to_next_module",
+ target_port=self.next_module_port,
+ )
self.send_to_next_module.send_pyobj(
_group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL
)
+ self._cleanup_req_stage(_group_req_indexes.group_req_id)
processing_group_reqs = []
if len(audios_need_infer) == 0:
+ self._log_req_stage(
+ group_req_indexes.group_req_id,
+ "audio_send_to_next_module",
+ target_port=self.next_module_port,
+ pending_audio_count=0,
+ )
self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+ self._cleanup_req_stage(group_req_indexes.group_req_id)
else:
processing_group_reqs.append(group_req_indexes)
if len(audios_need_infer) > 0:
+ batch_req_ids = [req.group_req_id for req in processing_group_reqs]
+ logger.info(
+ f"audio_batch_ready req_ids:{batch_req_ids} "
+ f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
+ )
+ for batch_req_id in batch_req_ids:
+ self._log_req_stage(batch_req_id, "audio_infer_start", batch_audio_count=len(audios_need_infer))
await self.infer_audios(audios_need_infer)
+ for batch_req_id in batch_req_ids:
+ self._log_req_stage(batch_req_id, "audio_infer_done", batch_audio_count=len(audios_need_infer))
for _group_req_indexes in processing_group_reqs:
+ self._log_req_stage(
+ _group_req_indexes.group_req_id,
+ "audio_send_to_next_module",
+ target_port=self.next_module_port,
+ )
self.send_to_next_module.send_pyobj(_group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+ self._cleanup_req_stage(_group_req_indexes.group_req_id)
processing_group_reqs = []
audios_need_infer = []
@@ -149,6 +242,12 @@ async def loop_for_netio_req(self):
f"audio recv req id {recv_req.group_req_id} "
f"audio count {len(recv_req.multimodal_params.audios)}"
)
+ self._log_req_stage(
+ recv_req.group_req_id,
+ "audio_recv",
+ audio_count=len(recv_req.multimodal_params.audios),
+ waiting_queue_size=len(self.waiting_reqs),
+ )
self.waiting_reqs.append(recv_req)
else:
assert False, f"Error Req Inf {recv_req}"
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index cbd39666a0..8db3be7f35 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -1,8 +1,12 @@
import asyncio
-import rpyc
+import inspect
+import io
import socket
+import wave
+
+import numpy as np
+import rpyc
import torch
-import inspect
from typing import List
from rpyc.utils.classic import obtain
from rpyc.utils.server import ThreadedServer
@@ -13,6 +17,21 @@
from lightllm.utils.infer_utils import set_random_seed
from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
from lightllm.utils.graceful_utils import graceful_registry
+from lightllm.utils.log_utils import init_logger
+
+
+logger = init_logger(__name__)
+
+
+def _generate_silence_wav_bytes(sample_rate: int = 16000, num_samples: int = 16000) -> bytes:
+ samples = np.zeros(num_samples, dtype=np.int16)
+ buffer = io.BytesIO()
+ with wave.open(buffer, "wb") as wav_file:
+ wav_file.setnchannels(1)
+ wav_file.setsampwidth(2)
+ wav_file.setframerate(sample_rate)
+ wav_file.writeframes(samples.tobytes())
+ return buffer.getvalue()
class AudioModelRpcServer(rpyc.Service):
@@ -72,6 +91,13 @@ def exposed_encode(self, audios):
audios = obtain(audios)
return self.forward(audios)
+ def exposed_warmup_model(self):
+ torch.cuda.set_device(self.dp_rank_id)
+ warmup_audio = _generate_silence_wav_bytes()
+ self.model.warmup(warmup_audio)
+ logger.info(f"audio model warmup finished on dp_rank_id:{self.dp_rank_id}")
+ return
+
class AudioModelRpcClient:
def __init__(self, model_rpc, world_size, rpc_server_process=None):
@@ -94,9 +120,11 @@ async def _func(*args, **kwargs):
self._init_model = async_wrap(self.model.init_model)
self._encode = async_wrap(self.model.encode)
+ self._warmup_model = async_wrap(self.model.warmup_model)
else:
self._init_model = self.model.exposed_init_model
self._encode = self.model.exposed_encode
+ self._warmup_model = self.model.exposed_warmup_model
return
async def init_model(self, kvargs):
@@ -111,6 +139,12 @@ async def encode(self, audios: List[AudioItem]):
return await ans
return ans
+ async def warmup_model(self):
+ ans = self._warmup_model()
+ if self.use_rpc:
+ return await ans
+ return ans
+
def _init_env(port, device_id):
graceful_registry(inspect.currentframe().f_code.co_name)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e28e4c93ad..3a818b0a39 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -124,6 +124,13 @@ def __init__(
self.latest_success_infer_time_mark.set_value(int(time.time()))
return
+ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs):
+ cost_ms = (time.time() - start_time) * 1000.0
+ extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+ suffix = f" {extras}" if extras else ""
+ logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+ return
+
async def _alloc_resource(self, items, md5sums, token_nums, datas):
while True:
@@ -287,6 +294,10 @@ async def generate(
start_time = time.time()
request_headers = request.headers if request is not None else {}
group_request_id = self.alloc_req_id(sampling_params, is_health_req)
+ if request is not None:
+ request.state.lightllm_req_id = group_request_id
+ audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
+ image_count = len(multimodal_params.images) if multimodal_params is not None else 0
try:
original_multimodal_params = None
@@ -295,11 +306,26 @@ async def generate(
if self.pd_mode.is_P_or_NORMAL():
await multimodal_params.verify_and_preload(request)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "verify_and_preload_done",
+ audio_count=audio_count,
+ image_count=image_count,
+ )
# 记录请求到达的相关信息
await self._log_req_header(request_headers, group_request_id)
# encode
prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "encode_done",
+ prompt_tokens=len(prompt_ids),
+ audio_count=audio_count,
+ image_count=image_count,
+ )
prompt_tokens = len(prompt_ids)
# 监控
@@ -308,6 +334,13 @@ async def generate(
self.metric_client.histogram_observe("lightllm_request_input_length", prompt_tokens)
self.metric_client.histogram_observe("lightllm_request_max_new_tokens", sampling_params.max_new_tokens)
prompt_ids = await self._check_and_repair_length(prompt_ids, sampling_params)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "check_and_repair_length_done",
+ prompt_tokens=len(prompt_ids),
+ max_new_tokens=sampling_params.max_new_tokens,
+ )
if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP():
# 在 nixl pd 模式下的 p 节点, 为了更好的兼容多模态的推理流程,np 节点需要先上报其 encode 好的 prompt ids 信息,然后
@@ -355,6 +388,12 @@ async def generate(
chunked_prefill_size=self.args.chunked_prefill_size,
)
req_objs.append(req_obj)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "shm_req_init_done",
+ req_count=len(req_objs),
+ )
logger.debug(
f"alloc shm_req for req_id {group_request_id}, "
@@ -368,6 +407,13 @@ async def generate(
await self.transfer_to_next_module_or_node(
prompt, sampling_params, original_multimodal_params, req_status.group_req_objs
)
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "request_forwarded",
+ has_audio=audio_count > 0,
+ has_image=image_count > 0,
+ )
results_generator = self._wait_to_token_package(
start_time,
@@ -445,7 +491,15 @@ async def _encode(
), "too many multimodal items!"
if multimodal_params.audios:
assert not self.args.disable_audio, "audio multimodal not enabled"
+ encode_start_time = time.time()
await self._alloc_multimodal_resources(multimodal_params, sampling_params)
+ log_req_id = getattr(sampling_params, "group_request_id", None)
+ logger.info(
+ f"lightllm_req_id:{log_req_id} "
+ f"stage:alloc_multimodal_resources_done "
+ f"elapsed_ms:{(time.time() - encode_start_time) * 1000.0:.3f} "
+ f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
+ )
prompt_ids = self.tokenizer.encode(
prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
)
@@ -539,20 +593,39 @@ async def transfer_to_next_module(
if self.pd_mode.is_P_or_NORMAL():
if not self.args.disable_vision:
+ logger.info(
+ f"lightllm_req_id:{group_req_objs.group_req_id} "
+ f"stage:transfer_to_visual "
+ f"target_port:{self.args.visual_port}"
+ )
self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
return
if not self.args.disable_audio:
+ logger.info(
+ f"lightllm_req_id:{group_req_objs.group_req_id} "
+ f"stage:transfer_to_audio "
+ f"target_port:{self.args.audio_port}"
+ )
self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
return
if self.args.enable_cpu_cache:
+ logger.info(
+ f"lightllm_req_id:{group_req_objs.group_req_id} "
+ f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}"
+ )
self.send_to_multi_level_kv_cache.send_pyobj(
group_req_objs.to_group_req_index(),
protocol=pickle.HIGHEST_PROTOCOL,
)
return
+ logger.info(
+ f"lightllm_req_id:{group_req_objs.group_req_id} "
+ f"stage:transfer_to_router "
+ f"target_port:{self.args.router_port}"
+ )
self.send_to_router.send_pyobj(
group_req_objs.to_group_req_index(),
protocol=pickle.HIGHEST_PROTOCOL,
@@ -561,6 +634,11 @@ async def transfer_to_next_module(
if self.pd_mode.is_D():
# 在 D 模式下,不需要传输真的多模态参数,因为其已经被 P 处理好了
+ logger.info(
+ f"lightllm_req_id:{group_req_objs.group_req_id} "
+ f"stage:transfer_to_router_from_decode "
+ f"target_port:{self.args.router_port}"
+ )
self.send_to_router.send_pyobj(
group_req_objs.to_group_req_index(),
protocol=pickle.HIGHEST_PROTOCOL,
@@ -619,6 +697,11 @@ async def _wait_to_token_package(
first_token_cost_ms = (time.time() - start_time) * 1000
is_first_token = False
self.first_time_costs.add(first_token_cost_ms)
+ logger.info(
+ f"lightllm_req_id:{group_request_id} "
+ f"stage:first_token_arrived elapsed_ms:{first_token_cost_ms:.3f} "
+ f"sub_req_id:{sub_req_id} prompt_tokens:{prompt_tokens}"
+ )
out_token_counter += 1
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 09a07455b3..cd9d652ab8 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,5 +1,7 @@
"""Multimodal parameters for text generation."""
import os
+import wave
+import time
import librosa
import base64
from typing import List
@@ -12,6 +14,17 @@
logger = init_logger(__name__)
+def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
+ num_samples = max(1, int(sample_rate * duration_seconds))
+ with BytesIO() as buffer:
+ with wave.open(buffer, "wb") as wav_file:
+ wav_file.setnchannels(1)
+ wav_file.setsampwidth(2)
+ wav_file.setframerate(sample_rate)
+ wav_file.writeframes(b"\x00\x00" * num_samples)
+ return buffer.getvalue()
+
+
class AudioItem:
def __init__(self, **kwargs):
self._type = kwargs["type"]
@@ -32,6 +45,9 @@ def __init__(self, **kwargs):
async def preload(self, request: Request):
try:
+ req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None)
+ preload_start = time.time()
+ source_ready_start = preload_start
if self._type == "url":
timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
proxy = os.getenv("REQUEST_PROXY", None)
@@ -40,13 +56,22 @@ async def preload(self, request: Request):
audio_data = base64.b64decode(self._data)
else:
raise ValueError(f"cannot read audio which type is {self._type}!")
+ source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0
# check if valid audio bytes
+ decode_start = time.time()
audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+ decode_cost_ms = (time.time() - decode_start) * 1000.0
from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度
self._preload_data = audio_data
+ logger.info(
+ f"lightllm_req_id:{req_id} stage:audio_preload_done "
+ f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
+ f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} "
+ f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length}"
+ )
return
except Exception as e:
@@ -184,3 +209,13 @@ def to_origin_dict(self):
ret["images"] = [i.to_origin_dict() for i in self.images]
ret["audios"] = [a.to_origin_dict() for a in self.audios]
return ret
+
+
+async def warmup_audio_preload():
+ warmup_audio = AudioItem(
+ type="base64",
+ data=base64.b64encode(generate_silence_wav_bytes()).decode("utf-8"),
+ )
+ await warmup_audio.preload(None)
+ warmup_audio.read()
+ return
From a3872599dc98eecd98e28915e0d77f09d96e61ec Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 00:45:08 +0000
Subject: [PATCH 04/51] add http client cache
---
lightllm/utils/multimodal_utils.py | 35 +++++++++++++++++++-----------
1 file changed, 22 insertions(+), 13 deletions(-)
diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py
index 14c8303273..6e3766f950 100644
--- a/lightllm/utils/multimodal_utils.py
+++ b/lightllm/utils/multimodal_utils.py
@@ -3,10 +3,14 @@
import httpx
from PIL import Image
from io import BytesIO
+from urllib.parse import urlparse
+from typing import Dict, Optional
from fastapi import Request
from lightllm.utils.log_utils import init_logger
logger = init_logger(__name__)
+_HTTP_CLIENTS: Dict[Optional[str], httpx.AsyncClient] = {}
+_LOCAL_RESOURCE_HOSTS = {"127.0.0.1", "localhost", "::1"}
def image2base64(img_str: str):
@@ -21,20 +25,25 @@ def image2base64(img_str: str):
async def fetch_resource(url, request: Request, timeout, proxy=None):
logger.info(f"Begin to download resource from url: {url}")
start_time = time.time()
- async with httpx.AsyncClient(proxy=proxy) as client:
- async with client.stream("GET", url, timeout=timeout) as response:
- response.raise_for_status()
- ans_bytes = []
- async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
- if request is not None and await request.is_disconnected():
- await response.aclose()
- raise Exception("Request disconnected. User cancelled download.")
- ans_bytes.append(chunk)
- # 接收的数据不能大于128M
- if len(ans_bytes) > 128:
- raise Exception(f"url {url} recv data is too big")
+ hostname = urlparse(url).hostname
+ effective_proxy = None if hostname in _LOCAL_RESOURCE_HOSTS else proxy
+ client = _HTTP_CLIENTS.get(effective_proxy)
+ if client is None:
+ client = httpx.AsyncClient(proxy=effective_proxy)
+ _HTTP_CLIENTS[effective_proxy] = client
+ async with client.stream("GET", url, timeout=timeout) as response:
+ response.raise_for_status()
+ ans_bytes = []
+ async for chunk in response.aiter_bytes(chunk_size=1024 * 1024):
+ if request is not None and await request.is_disconnected():
+ await response.aclose()
+ raise Exception("Request disconnected. User cancelled download.")
+ ans_bytes.append(chunk)
+ # received data must not exceed 128MB (counted as 1MB chunks, so this is approximate)
+ if len(ans_bytes) > 128:
+ raise Exception(f"url {url} recv data is too big")
- content = b"".join(ans_bytes)
+ content = b"".join(ans_bytes)
end_time = time.time()
cost_time = end_time - start_time
logger.info(f"Download url {url} resource cost time: {cost_time} seconds")
From cd89cd613117c33a5900dc2fb2466ea2d5599797 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 00:48:15 +0000
Subject: [PATCH 05/51] reduce polling time
---
lightllm/server/audioserver/manager.py | 7 ++++++-
lightllm/server/router/manager.py | 9 +++------
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index b4fb002965..ac4058b643 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -55,6 +55,7 @@ def __init__(
self.model_rpcs: List[AudioModelRpcClient] = []
self.req_stage_times: Dict[int, Dict[str, float]] = {}
self.next_module_port = args.multi_level_kv_cache_port if args.enable_cpu_cache else args.router_port
+ self.waiting_reqs_event = asyncio.Event()
def _mark_req_stage(self, req_id: int, stage: str):
now = time.time()
@@ -131,7 +132,10 @@ async def infer_audios(self, audios: List[AudioItem]):
async def loop_for_fwd(self):
while True:
if len(self.waiting_reqs) == 0:
- await asyncio.sleep(0.01) # 10ms
+ self.waiting_reqs_event.clear()
+ if len(self.waiting_reqs) == 0:
+ await self.waiting_reqs_event.wait()
+ continue
else:
processing_group_reqs = []
audios_need_infer = []
@@ -249,6 +253,7 @@ async def loop_for_netio_req(self):
waiting_queue_size=len(self.waiting_reqs),
)
self.waiting_reqs.append(recv_req)
+ self.waiting_reqs_event.set()
else:
assert False, f"Error Req Inf {recv_req}"
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index 0d2705fab2..f5e0b8df9a 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -316,8 +316,7 @@ async def _add_batch(self, batch: Batch):
# 添加新请求
reqs = [r.to_router_rpc_obj() for r in batch.reqs]
while not self.shm_reqs_io_buffer.is_empty():
- await asyncio.sleep(0.02)
-
+ await asyncio.sleep(0.001)
self.shm_reqs_io_buffer.write_obj(reqs)
self.shm_reqs_io_buffer.set_ready()
logger.debug(f"Prefill Batch: {batch.simple_log()} \n")
@@ -326,8 +325,7 @@ async def _add_batch(self, batch: Batch):
async def _aborted_reqs(self, aborted_reqs: List[Req]):
cmds = [AbortedReqCmd(req_id=r.request_id) for r in aborted_reqs]
while not self.shm_reqs_io_buffer.is_empty():
- await asyncio.sleep(0.02)
-
+ await asyncio.sleep(0.001)
self.shm_reqs_io_buffer.write_obj(cmds)
self.shm_reqs_io_buffer.set_ready()
return
@@ -335,8 +333,7 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]):
async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]):
cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs]
while not self.shm_reqs_io_buffer.is_empty():
- await asyncio.sleep(0.02)
-
+ await asyncio.sleep(0.001)
self.shm_reqs_io_buffer.write_obj(cmds)
self.shm_reqs_io_buffer.set_ready()
return
From 4788980006dc0be673e151d0c9c8f4cf12afcfdf Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 01:27:53 +0000
Subject: [PATCH 06/51] Optimize audio shm payload handling and cache lookups
---
.../qwen3_omni_audio.py | 47 ++++++++++++-------
lightllm/models/whisper/whisper_audio.py | 20 ++------
.../embed_cache/impl/naive_memory_cache.py | 2 +
lightllm/server/httpserver/manager.py | 21 ++++-----
lightllm/server/multimodal_params.py | 37 ++++++++++++++-
5 files changed, 82 insertions(+), 45 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 6c620448b9..424a768bbf 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -1,6 +1,7 @@
import os
import json
import math
+import time
import torch
import rpyc
import librosa
@@ -10,16 +11,18 @@
from safetensors import safe_open
from torch.nn import functional as F
from typing import Callable, Optional, Union, List
-from rpyc.utils.classic import obtain
-
from transformers.activations import ACT2FN
-from lightllm.server.multimodal_params import AudioItem
+from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
+from lightllm.utils.log_utils import init_logger
+
+
+logger = init_logger(__name__)
def _get_feat_extract_output_lengths(input_lengths):
@@ -338,6 +341,11 @@ def forward(
return hidden_states
def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedCacheClient):
+ encode_start = time.time()
+ load_shm_cost = 0.0
+ preprocess_cost = 0.0
+ forward_cost = 0.0
+ cache_copy_cost = 0.0
uuids = []
items: List[AudioItem] = []
per_audio_features: List[torch.Tensor] = []
@@ -345,12 +353,14 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
if isinstance(item, AudioItem):
uuids.append(item.uuid)
items.append(item)
+ load_start = time.time()
audio_data = read_shm(get_shm_name_data(item.uuid))
- audio = BytesIO(audio_data)
- audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
+ audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
+ load_shm_cost += time.time() - load_start
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
+ preprocess_start = time.time()
input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
if feature_attention_mask is not None:
audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
@@ -361,22 +371,19 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
feature_lens = (
audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
)
+ preprocess_cost += time.time() - preprocess_start
+ forward_start = time.time()
audio_features = self.forward(
input_features,
feature_lens=feature_lens,
)
+ forward_cost += time.time() - forward_start
per_audio_features.append(audio_features)
- ready_audio = obtain(self.cache_client.root.get_items_embed(uuids))
- ids_to_set = []
- for i, ready in enumerate(ready_audio):
- if ready:
- continue
-
- uid = uuids[i]
+ cache_copy_start = time.time()
+ for i, uid in enumerate(uuids):
item = items[i]
-
cur_embed = per_audio_features[i]
cpu_embed_cache_client.copy_to_cache(
embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache
@@ -384,11 +391,19 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
assert (
item.token_num == cur_embed.shape[0]
), f"audio token num not match {item.token_num} vs {cur_embed.shape[0]} "
- ids_to_set.append(uid)
- if ids_to_set:
- self.cache_client.root.set_items_embed(ids=ids_to_set)
+ if uuids:
torch.cuda.current_stream().synchronize()
+ self.cache_client.root.set_items_embed(ids=uuids)
+ cache_copy_cost += time.time() - cache_copy_start
+ logger.info(
+ f"audio_encode_batch_done audio_count:{len(audio_items)} "
+ f"load_shm_ms:{load_shm_cost * 1000.0:.3f} "
+ f"preprocess_ms:{preprocess_cost * 1000.0:.3f} "
+ f"forward_ms:{forward_cost * 1000.0:.3f} "
+ f"cache_ms:{cache_copy_cost * 1000.0:.3f} "
+ f"elapsed_ms:{(time.time() - encode_start) * 1000.0:.3f}"
+ )
@torch.no_grad()
def warmup(self, audio_bytes: bytes):
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 0493afdb9a..a94d22dd0c 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -10,8 +10,7 @@
from safetensors.torch import load_file
from transformers.processing_utils import ProcessorMixin
from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
-from lightllm.server.multimodal_params import AudioItem
-from rpyc.utils.classic import obtain
+from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
# tokenizer_class removed
@@ -175,8 +174,7 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
uuids.append(item.uuid)
items.append(item)
audio_data = read_shm(get_shm_name_data(item.uuid))
- audio = BytesIO(audio_data)
- audio, _ = librosa.load(audio, sr=16000)
+ audio = load_audio_from_shm_payload(audio_data, item.extra_params, 16000)
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
@@ -222,25 +220,17 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
continue
per_audio_embeds[owner].append(audios[chunk_idx][:token_len])
- ready_audio = obtain(self.cache_client.root.get_items_embed(uuids))
- ids_to_set = []
- for i, ready in enumerate(ready_audio):
- if ready:
- continue
-
- uid = uuids[i]
+ for i, uid in enumerate(uuids):
item = items[i]
-
# 拼接该 audio 的所有 chunk embedding
cur_embed = torch.cat(per_audio_embeds[i], dim=0)
cpu_embed_cache_client.copy_to_cache(
embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache
)
- ids_to_set.append(uid)
- if ids_to_set:
- self.cache_client.root.set_items_embed(ids=ids_to_set)
+ if uuids:
torch.cuda.current_stream().synchronize()
+ self.cache_client.root.set_items_embed(ids=uuids)
@torch.no_grad()
def warmup(self, audio_bytes: bytes):
diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py
index 5ad26fbcc8..ff7b2374b2 100644
--- a/lightllm/server/embed_cache/impl/naive_memory_cache.py
+++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py
@@ -205,6 +205,8 @@ def alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[l
"token_id": rec.token_id,
"start_index_in_embed_cache": rec.mem_block.start,
"token_num": rec.token_num,
+ "data_ready": rec.data,
+ "embed_ready": rec.embed,
}
)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 3a818b0a39..8b3be9b0e8 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -144,23 +144,17 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
- uid_list = []
- for item, rec in zip(items, records):
+ update_data_ids = []
+ for item, rec, data in zip(items, records, datas):
item: Union[ImageItem, AudioItem] = item
item.uuid = rec["id"]
item.token_id = rec["token_id"]
item.token_num = rec["token_num"]
item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
- uid_list.append(rec["id"])
-
- ready_flags = obtain(self.cache_client.root.get_items_data(uid_list))
- update_data_ids = []
-
- for uid, ready, data in zip(uid_list, ready_flags, datas):
- if not ready:
- create_shm(get_shm_name_data(uid), data)
- update_data_ids.append(uid)
+ if not rec["data_ready"]:
+ create_shm(get_shm_name_data(rec["id"]), data)
+ update_data_ids.append(rec["id"])
if update_data_ids:
self.cache_client.root.set_items_data(update_data_ids)
@@ -188,7 +182,10 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
data = audio.read()
token_num = self.tokenizer.get_audio_token_length(audio)
- md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
+ payload_md5 = audio.extra_params.get("audio_payload_md5")
+ if payload_md5 is None:
+ payload_md5 = hashlib.md5(data).hexdigest()
+ md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
md5sums.append(md5sum)
tokens_nums.append(token_num)
datas.append(data)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index cd9d652ab8..13a26d9b57 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -4,6 +4,8 @@
import time
import librosa
import base64
+import hashlib
+import numpy as np
from typing import List
from io import BytesIO
from PIL import Image
@@ -12,6 +14,9 @@
from lightllm.utils.log_utils import init_logger
logger = init_logger(__name__)
+RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
+WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
+AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
@@ -25,6 +30,22 @@ def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float
return buffer.getvalue()
+def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
+ audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT)
+ if audio_shm_format == WAVEFORM_F32_SHM_FORMAT:
+ num_samples = int(extra_params.get("audio_num_samples", 0))
+ if num_samples > 0:
+ return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
+ return np.frombuffer(audio_data, dtype=np.float32)
+
+ audio, _ = librosa.load(BytesIO(audio_data), sr=sample_rate)
+ return np.asarray(audio, dtype=np.float32)
+
+
+def should_use_raw_audio_shm() -> bool:
+ return os.getenv(AUDIO_SHM_USE_RAW_ENV, "0") == "1"
+
+
class AudioItem:
def __init__(self, **kwargs):
self._type = kwargs["type"]
@@ -61,16 +82,28 @@ async def preload(self, request: Request):
# check if valid audio bytes
decode_start = time.time()
audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+ audio_values = np.asarray(audio_values, dtype=np.float32)
decode_cost_ms = (time.time() - decode_start) * 1000.0
from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度
- self._preload_data = audio_data
+ if should_use_raw_audio_shm():
+ self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
+ self.extra_params.pop("audio_sample_rate", None)
+ self.extra_params.pop("audio_num_samples", None)
+ self._preload_data = audio_data
+ else:
+ self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
+ self.extra_params["audio_sample_rate"] = 16000
+ self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
+ self._preload_data = audio_values.tobytes()
+ self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
logger.info(
f"lightllm_req_id:{req_id} stage:audio_preload_done "
f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} "
- f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length}"
+ f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length} "
+ f"shm_format:{self.extra_params['audio_shm_format']}"
)
return
From 7b05403af6df9f42d294c5b28ee76fd7c4b89342 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 01:31:21 +0000
Subject: [PATCH 07/51] cache hann_window/mel_filters
---
.../qwen3_omni_moe_thinker/audio_process.py | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 833cc8f4b0..e9dc931886 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -46,10 +46,25 @@ def __init__(
norm="slaney",
mel_scale="slaney",
)
+ self._hann_window_cache = {}
+ self._mel_filters_cache = {}
+
+ def _get_cached_feature_tensors(self, device: Union[str, torch.device]):
+ device_key = str(device)
+ window = self._hann_window_cache.get(device_key)
+ if window is None:
+ window = torch.hann_window(self.n_fft, device=device)
+ self._hann_window_cache[device_key] = window
+
+ mel_filters = self._mel_filters_cache.get(device_key)
+ if mel_filters is None:
+ mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
+ self._mel_filters_cache[device_key] = mel_filters
+ return window, mel_filters
def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
waveform = torch.from_numpy(waveform).to(device, torch.float32)
- window = torch.hann_window(self.n_fft, device=device)
+ window, mel_filters = self._get_cached_feature_tensors(device)
if self.dither != 0.0:
waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
@@ -57,7 +72,6 @@ def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu
stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs() ** 2
- mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
mel_spec = mel_filters.T @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
From 713c45d912aec4b6955aaf6e55be0ef8e5705dd6 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Mon, 30 Mar 2026 08:26:52 +0000
Subject: [PATCH 08/51] Fix audio preload config to follow tokenizer settings
---
.../common/basemodel/multimodal_tokenizer.py | 1 +
lightllm/models/internvl/model.py | 5 +++++
.../models/qwen3_omni_moe_thinker/model.py | 5 +++++
lightllm/server/api_http.py | 4 +++-
lightllm/server/httpserver/manager.py | 4 +++-
lightllm/server/multimodal_params.py | 20 +++++++++++--------
6 files changed, 29 insertions(+), 10 deletions(-)
diff --git a/lightllm/common/basemodel/multimodal_tokenizer.py b/lightllm/common/basemodel/multimodal_tokenizer.py
index cdcbd7f089..872a418bf7 100644
--- a/lightllm/common/basemodel/multimodal_tokenizer.py
+++ b/lightllm/common/basemodel/multimodal_tokenizer.py
@@ -33,6 +33,7 @@
class BaseMultiModalTokenizer(ABC):
def __init__(self, tokenizer, **kwargs):
self.tokenizer = tokenizer
+ self.audio_preload_config = None
def __getattr__(self, name):
obj_dict = object.__getattribute__(self, "__dict__")
diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py
index ccb76d3512..70c797aeb8 100644
--- a/lightllm/models/internvl/model.py
+++ b/lightllm/models/internvl/model.py
@@ -50,6 +50,11 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
self.audio_min_length = MIN_AUDIO_LEN
self.audio_max_length = 16000 * 30
+ self.audio_preload_config = {
+ "sampling_rate": 16000,
+ "hop_length": 160,
+ "min_audio_len": int(self.audio_min_length),
+ }
def init_imageitem_extral_params(
self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index a1419f83ef..4a5131bbf1 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -42,6 +42,11 @@ def __init__(self, tokenizer=None, processor=None, **kwargs):
self.sampling_rate = self.audio_processor.sampling_rate
self.n_samples = self.audio_processor.n_samples
self.hop_length = self.audio_processor.hop_length
+ self.audio_preload_config = {
+ "sampling_rate": int(self.sampling_rate),
+ "hop_length": int(self.hop_length),
+ "min_audio_len": int(MIN_AUDIO_LEN),
+ }
self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"]
self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"]
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 6be738befc..cb7619fbe5 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -272,7 +272,9 @@ async def tokens(request: Request):
multimodal_params_dict = request_dict.get("multimodal_params", {})
multimodal_params = MultimodalParams(**multimodal_params_dict)
- await multimodal_params.verify_and_preload(request)
+ await multimodal_params.verify_and_preload(
+ request, audio_preload_config=getattr(g_objs.httpserver_manager.tokenizer, "audio_preload_config", None)
+ )
return JSONResponse(
{
"ntokens": g_objs.httpserver_manager.tokens(
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8b3be9b0e8..9a6864774a 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -302,7 +302,9 @@ async def generate(
original_multimodal_params = copy.deepcopy(multimodal_params)
if self.pd_mode.is_P_or_NORMAL():
- await multimodal_params.verify_and_preload(request)
+ await multimodal_params.verify_and_preload(
+ request, audio_preload_config=getattr(self.tokenizer, "audio_preload_config", None)
+ )
self._log_stage_timing(
group_request_id,
start_time,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 13a26d9b57..440bff06c5 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -17,6 +17,8 @@
RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
+DEFAULT_AUDIO_SAMPLE_RATE = 16000
+DEFAULT_MIN_AUDIO_LEN = 480
def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
@@ -64,7 +66,7 @@ def __init__(self, **kwargs):
self._preload_data = None
self.extra_params = {}
- async def preload(self, request: Request):
+ async def preload(self, request: Request, audio_preload_config: dict = None):
try:
req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None)
preload_start = time.time()
@@ -79,14 +81,16 @@ async def preload(self, request: Request):
raise ValueError(f"cannot read audio which type is {self._type}!")
source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0
+ audio_preload_config = audio_preload_config or {}
+ target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE))
+ min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN))
+
# check if valid audio bytes
decode_start = time.time()
- audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000)
+ audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate)
audio_values = np.asarray(audio_values, dtype=np.float32)
decode_cost_ms = (time.time() - decode_start) * 1000.0
- from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
-
- self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度
+ self.audio_length = max(audio_values.shape[0], min_audio_len)
if should_use_raw_audio_shm():
self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
self.extra_params.pop("audio_sample_rate", None)
@@ -94,7 +98,7 @@ async def preload(self, request: Request):
self._preload_data = audio_data
else:
self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
- self.extra_params["audio_sample_rate"] = 16000
+ self.extra_params["audio_sample_rate"] = target_sample_rate
self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
self._preload_data = audio_values.tobytes()
self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
@@ -221,11 +225,11 @@ def __init__(
self.audios = [AudioItem(**a) for a in audios]
return
- async def verify_and_preload(self, request: Request):
+ async def verify_and_preload(self, request: Request, audio_preload_config: dict = None):
for image in self.images:
await image.preload(request)
for audio in self.audios:
- await audio.preload(request)
+ await audio.preload(request, audio_preload_config=audio_preload_config)
return
def to_dict(self):
From 65a3ec67bb94bd41b604e415f2e227ae35c81ef9 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 31 Mar 2026 09:27:15 +0000
Subject: [PATCH 09/51] Optimize qwen3 omni audio preprocessing fast path
---
.../qwen3_omni_moe_thinker/audio_process.py | 23 +++++++++++
.../qwen3_omni_audio.py | 41 +++++++++++--------
lightllm/server/multimodal_params.py | 16 +++++++-
3 files changed, 61 insertions(+), 19 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index e9dc931886..42eae8edb5 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -105,6 +105,29 @@ def zero_mean_unit_var_norm(
return normed_input_values
+ def _preprocess_single_padded(
+ self,
+ raw_speech: np.ndarray,
+ num_frames: int,
+ device: Optional[str] = "cpu",
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ waveform = np.asarray(raw_speech, dtype=np.float32)
+ if waveform.ndim != 1:
+ raise ValueError(f"single audio fast path expects 1D waveform, got shape={waveform.shape}")
+
+ extracted = self._torch_extract_fbank_features(waveform[None, :], device)
+ extracted = np.asarray(extracted, dtype=np.float32)
+ if extracted.ndim != 3:
+ raise ValueError(f"unexpected extracted feature shape={extracted.shape}")
+
+ if extracted.shape[-1] < num_frames:
+ raise ValueError(f"feature frames {extracted.shape[-1]} < requested num_frames {num_frames}")
+
+ compact_features = torch.from_numpy(extracted[:, :, :num_frames]).to(device="cuda", dtype=torch.bfloat16)
+ compact_features = compact_features[0].contiguous()
+ feature_lens = torch.tensor([num_frames], device="cuda", dtype=torch.long)
+ return compact_features, feature_lens
+
def _preprocess(
self,
raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 424a768bbf..f3cd0525eb 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -13,7 +13,7 @@
from typing import Callable, Optional, Union, List
from transformers.activations import ACT2FN
-from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
+from lightllm.server.multimodal_params import AudioItem, WAVEFORM_F32_SHM_FORMAT, load_audio_from_shm_payload
from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
@@ -356,21 +356,27 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
load_start = time.time()
audio_data = read_shm(get_shm_name_data(item.uuid))
audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
+ audio_num_frames = item.extra_params.get("audio_num_frames")
load_shm_cost += time.time() - load_start
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
preprocess_start = time.time()
- input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
- if feature_attention_mask is not None:
- audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
- input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
+ if audio_num_frames is not None and item.extra_params.get("audio_shm_format") == WAVEFORM_F32_SHM_FORMAT:
+ input_features, feature_lens = self.processor._preprocess_single_padded(
+ audio, int(audio_num_frames), device="cpu"
+ )
else:
- audio_feature_lengths = None
-
- feature_lens = (
- audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
- )
+ input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
+ if feature_attention_mask is not None:
+ audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+ input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
+ else:
+ audio_feature_lengths = None
+
+ feature_lens = (
+ audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+ )
preprocess_cost += time.time() - preprocess_start
forward_start = time.time()
@@ -409,14 +415,13 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
def warmup(self, audio_bytes: bytes):
audio = BytesIO(audio_bytes)
audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
- input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
- if feature_attention_mask is not None:
- audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
- input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
- else:
- audio_feature_lengths = None
-
- feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+ num_frames = (max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length
+ padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * (
+ self.processor.hop_length
+ )
+ if padded_len > audio.shape[0]:
+ audio = np.pad(audio, (0, padded_len - audio.shape[0]), mode="constant", constant_values=0.0)
+ input_features, feature_lens = self.processor._preprocess_single_padded(audio, num_frames, device="cpu")
_ = self.forward(
input_features,
feature_lens=feature_lens,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 440bff06c5..da5d239c6a 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -18,6 +18,7 @@
WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
DEFAULT_AUDIO_SAMPLE_RATE = 16000
+DEFAULT_AUDIO_HOP_LENGTH = 160
DEFAULT_MIN_AUDIO_LEN = 480
@@ -83,6 +84,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
audio_preload_config = audio_preload_config or {}
target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE))
+ hop_length = int(audio_preload_config.get("hop_length", DEFAULT_AUDIO_HOP_LENGTH))
min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN))
# check if valid audio bytes
@@ -90,16 +92,28 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate)
audio_values = np.asarray(audio_values, dtype=np.float32)
decode_cost_ms = (time.time() - decode_start) * 1000.0
- self.audio_length = max(audio_values.shape[0], min_audio_len)
+ effective_audio_len = max(audio_values.shape[0], min_audio_len)
+ padded_audio_len = ((effective_audio_len + hop_length - 1) // hop_length) * hop_length
+ if padded_audio_len > audio_values.shape[0]:
+ audio_values = np.pad(
+ audio_values,
+ (0, padded_audio_len - audio_values.shape[0]),
+ mode="constant",
+ constant_values=0.0,
+ )
+
+ self.audio_length = effective_audio_len
if should_use_raw_audio_shm():
self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
self.extra_params.pop("audio_sample_rate", None)
self.extra_params.pop("audio_num_samples", None)
+ self.extra_params.pop("audio_num_frames", None)
self._preload_data = audio_data
else:
self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
self.extra_params["audio_sample_rate"] = target_sample_rate
self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
+ self.extra_params["audio_num_frames"] = int((effective_audio_len + hop_length - 1) // hop_length)
self._preload_data = audio_values.tobytes()
self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
logger.info(
From 2e480081b77a6166a89d908ad5a465e3eaefe0fd Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 31 Mar 2026 09:27:58 +0000
Subject: [PATCH 10/51] Add audio server fast path for single pending requests
---
lightllm/server/audioserver/manager.py | 32 ++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index ac4058b643..d54856c265 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -167,6 +167,38 @@ async def loop_for_fwd(self):
else:
ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
+ pending_audios = [audio for audio, ready in zip(multimodal_params.audios, ready_audio) if not ready]
+ if (
+ pending_audios
+ and len(processing_group_reqs) == 0
+ and len(self.waiting_reqs) == 0
+ and len(pending_audios) < self.infer_batch_size
+ ):
+ logger.info(
+ f"audio_batch_ready req_ids:[{group_req_indexes.group_req_id}] "
+ f"audio_count:{len(pending_audios)} infer_batch_size:{self.infer_batch_size} fast_path:1"
+ )
+ self._log_req_stage(
+ group_req_indexes.group_req_id,
+ "audio_infer_start",
+ batch_audio_count=len(pending_audios),
+ )
+ await self.infer_audios(pending_audios)
+ self._log_req_stage(
+ group_req_indexes.group_req_id,
+ "audio_infer_done",
+ batch_audio_count=len(pending_audios),
+ )
+ self._log_req_stage(
+ group_req_indexes.group_req_id,
+ "audio_send_to_next_module",
+ target_port=self.next_module_port,
+ fast_path=1,
+ )
+ self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
+ self._cleanup_req_stage(group_req_indexes.group_req_id)
+ continue
+
current_req_has_pending_audio = False
for audio, ready in zip(multimodal_params.audios, ready_audio):
if not ready:
From 456a71aab0722a646945b5154d02a14420fd14a2 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Wed, 1 Apr 2026 02:46:02 +0000
Subject: [PATCH 11/51] Fix audio_num_frames computation to use floor division
---
lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +-
lightllm/server/multimodal_params.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index f3cd0525eb..04839e0ce8 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -415,7 +415,7 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC
def warmup(self, audio_bytes: bytes):
audio = BytesIO(audio_bytes)
audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
- num_frames = (max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length
+ num_frames = max(audio.shape[0], 480) // self.processor.hop_length
padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * (
self.processor.hop_length
)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index da5d239c6a..ad70443ca7 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -113,7 +113,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
self.extra_params["audio_sample_rate"] = target_sample_rate
self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
- self.extra_params["audio_num_frames"] = int((effective_audio_len + hop_length - 1) // hop_length)
+ self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
self._preload_data = audio_values.tobytes()
self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
logger.info(
From 479367d3466aa582fb920abef62c7de9adac2abc Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 01:55:59 +0000
Subject: [PATCH 12/51] Add tuned fp8/bf16 Triton autotune kernel configs for RTX 5090
---
...8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json | 92 ++++++++++++++++
...8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json | 92 ++++++++++++++++
...p8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json | 101 ++++++++++++++++++
...p8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json | 101 ++++++++++++++++++
.../{topk_num=8}_NVIDIA_GeForce_RTX_5090.json | 46 ++++++++
...6,topk_num=8}_NVIDIA_GeForce_RTX_5090.json | 68 ++++++++++++
...6,topk_num=8}_NVIDIA_GeForce_RTX_5090.json | 62 +++++++++++
...orch.float16}_NVIDIA_GeForce_RTX_5090.json | 42 ++++++++
...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 46 ++++++++
...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 90 ++++++++++++++++
...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 90 ++++++++++++++++
...orch.float16}_NVIDIA_GeForce_RTX_5090.json | 62 +++++++++++
...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 68 ++++++++++++
13 files changed, 960 insertions(+)
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..2a46877c76
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+ "1024": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "16384": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "512": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "800": {
+ "BLOCK_SIZE_K": 32,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "8192": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..7372d5c322
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,92 @@
+{
+ "1": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "100": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "256": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "32": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "64": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 5,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..569382ce2f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+ "1": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "100": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "32": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "64": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..1456fd0b4b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,101 @@
+{
+ "1024": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": true,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "128": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "16384": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "32768": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "512": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": false,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "GROUP_SIZE_M": 1,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 16,
+ "NEED_TRANS": true,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "800": {
+ "BLOCK_SIZE_K": 64,
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "GROUP_SIZE_M": 64,
+ "NEED_TRANS": true,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "8192": {
+ "BLOCK_SIZE_K": 128,
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "GROUP_SIZE_M": 32,
+ "NEED_TRANS": false,
+ "num_stages": 2,
+ "num_warps": 4
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..0f5983241f
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+ "1": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 1
+ },
+ "100": {
+ "BLOCK_SIZE": 256,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 8
+ },
+ "16": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "32": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 8
+ },
+ "4096": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 8
+ },
+ "64": {
+ "BLOCK_SIZE": 128,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_SIZE": 256,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..3612e98183
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+ "1": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 8
+ },
+ "100": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 4
+ },
+ "1024": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "128": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 8
+ },
+ "16": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 8
+ },
+ "2048": {
+ "BLOCK_DIM": 128,
+ "BLOCK_M": 2,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "256": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 2
+ },
+ "32": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_DIM": 128,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 4,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 2
+ },
+ "8": {
+ "BLOCK_DIM": 1024,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 4,
+ "num_warps": 4
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..ff46525471
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+ "1": {
+ "BLOCK_DIM": 128,
+ "BLOCK_M": 16,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "100": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 16
+ },
+ "1024": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "128": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "16": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 16
+ },
+ "2048": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 8
+ },
+ "256": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 2
+ },
+ "32": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 1
+ },
+ "64": {
+ "BLOCK_DIM": 512,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 1,
+ "num_warps": 8
+ },
+ "8": {
+ "BLOCK_DIM": 256,
+ "BLOCK_M": 1,
+ "NUM_STAGE": 2,
+ "num_warps": 16
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..e3eb000004
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,42 @@
+{
+ "1": {
+ "num_stages": 1,
+ "num_warps": 1
+ },
+ "100": {
+ "num_stages": 2,
+ "num_warps": 1
+ },
+ "1024": {
+ "num_stages": 5,
+ "num_warps": 2
+ },
+ "128": {
+ "num_stages": 4,
+ "num_warps": 1
+ },
+ "16": {
+ "num_stages": 1,
+ "num_warps": 1
+ },
+ "2048": {
+ "num_stages": 4,
+ "num_warps": 1
+ },
+ "256": {
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "32": {
+ "num_stages": 5,
+ "num_warps": 1
+ },
+ "64": {
+ "num_stages": 5,
+ "num_warps": 1
+ },
+ "8": {
+ "num_stages": 1,
+ "num_warps": 1
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..9d20b4ea6b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,46 @@
+{
+ "1": {
+ "num_stages": 4,
+ "num_warps": 2
+ },
+ "100": {
+ "num_stages": 1,
+ "num_warps": 1
+ },
+ "1024": {
+ "num_stages": 5,
+ "num_warps": 2
+ },
+ "128": {
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "16": {
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "2048": {
+ "num_stages": 3,
+ "num_warps": 2
+ },
+ "256": {
+ "num_stages": 2,
+ "num_warps": 2
+ },
+ "32": {
+ "num_stages": 4,
+ "num_warps": 1
+ },
+ "4096": {
+ "num_stages": 3,
+ "num_warps": 2
+ },
+ "64": {
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "8": {
+ "num_stages": 4,
+ "num_warps": 2
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..fdb476db92
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+ "1": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "100": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "1024": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 2,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "2048": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "32": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..5f06f89508
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,90 @@
+{
+ "1": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ },
+ "100": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "1024": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 128,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "16": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 5,
+ "num_warps": 4
+ },
+ "2048": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 128,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 64,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 8
+ },
+ "32": {
+ "BLOCK_K": 128,
+ "BLOCK_M": 16,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "4096": {
+ "BLOCK_K": 64,
+ "BLOCK_M": 64,
+ "BLOCK_N": 128,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 32,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 4,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_K": 256,
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "GROUP_M": 8,
+ "num_stages": 3,
+ "num_warps": 8
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..d0b540f69e
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,62 @@
+{
+ "1024": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "128": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 2,
+ "num_warps": 4
+ },
+ "16384": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "2048": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "256": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "512": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 1,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 32,
+ "NUM_STAGES": 1,
+ "num_warps": 8
+ },
+ "800": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 2,
+ "num_warps": 4
+ },
+ "8192": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ }
+}
\ No newline at end of file
diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
new file mode 100644
index 0000000000..6c5307023b
--- /dev/null
+++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json
@@ -0,0 +1,68 @@
+{
+ "1024": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "128": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 64,
+ "NUM_STAGES": 4,
+ "num_warps": 4
+ },
+ "16384": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 128,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "2048": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "256": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "32768": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 128,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "512": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 2,
+ "num_warps": 4
+ },
+ "64": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 4
+ },
+ "8": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 128,
+ "NUM_STAGES": 1,
+ "num_warps": 1
+ },
+ "800": {
+ "BLOCK_M": 1,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 4,
+ "num_warps": 1
+ },
+ "8192": {
+ "BLOCK_M": 8,
+ "BLOCK_N": 256,
+ "NUM_STAGES": 1,
+ "num_warps": 1
+ }
+}
\ No newline at end of file
From 2c09aa270edea34b29f73cae2109103d75073c92 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 02:15:11 +0000
Subject: [PATCH 13/51] set default model
---
lightllm/server/api_models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py
index 3d9a6bc8ed..3651bf4b64 100644
--- a/lightllm/server/api_models.py
+++ b/lightllm/server/api_models.py
@@ -187,7 +187,7 @@ def apply_loaded_defaults(cls, data: Any):
class ChatCompletionRequest(BaseModel):
- model: str
+ model: str = "default"
messages: List[ChatCompletionMessageParam]
function_call: Optional[str] = "none"
temperature: Optional[float] = 1
From 5168dae05ca72ebfdf51ff75fd1a109310677db2 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 08:55:53 +0000
Subject: [PATCH 14/51] add prompt_text_cache to QWen3OmniTokenizer
---
lightllm/models/qwen3_omni_moe_thinker/model.py | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 4a5131bbf1..6ae73fd1d1 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -1,6 +1,7 @@
import os
import json
import librosa
+from collections import OrderedDict
from io import BytesIO
from lightllm.common.build_utils import repair_config
from lightllm.models.registry import ModelRegistry
@@ -30,6 +31,8 @@
class QWen3OmniTokenizer(QWen3VLTokenizer):
def __init__(self, tokenizer=None, processor=None, **kwargs):
self.tokenizer = tokenizer
+ self._prompt_encode_cache = OrderedDict()
+ self._prompt_encode_cache_capacity = 64
# image
self.image_processor = processor.image_processor
self.min_pixel = self.image_processor.min_pixels
@@ -71,6 +74,18 @@ def get_audio_token_length(self, audio: AudioItem):
# print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}")
return token_num
+ def _encode_prompt_text(self, prompt: str):
+ cached_ids = self._prompt_encode_cache.get(prompt)
+ if cached_ids is not None:
+ self._prompt_encode_cache.move_to_end(prompt)
+ return list(cached_ids)
+
+ origin_ids = self.tokenizer.encode(prompt)
+ self._prompt_encode_cache[prompt] = tuple(origin_ids)
+ if len(self._prompt_encode_cache) > self._prompt_encode_cache_capacity:
+ self._prompt_encode_cache.popitem(last=False)
+ return origin_ids
+
def _caclu_audio_token_num(self, input_audio_len: int):
_mel_len = input_audio_len // int(self.hop_length)
input_lengths_leave = _mel_len % 100
@@ -79,7 +94,7 @@ def _caclu_audio_token_num(self, input_audio_len: int):
return output_lengths
def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
- origin_ids = self.tokenizer.encode(prompt)
+ origin_ids = self._encode_prompt_text(prompt)
#
->
origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)]
From 167f8b0e7449cc5a15755f0fe92edb5f5e95cd7f Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 08:56:50 +0000
Subject: [PATCH 15/51] multi images or audios use asyncio
---
lightllm/server/multimodal_params.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index ad70443ca7..ce166b5980 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,4 +1,5 @@
"""Multimodal parameters for text generation."""
+import asyncio
import os
import wave
import time
@@ -240,10 +241,12 @@ def __init__(
return
async def verify_and_preload(self, request: Request, audio_preload_config: dict = None):
- for image in self.images:
- await image.preload(request)
- for audio in self.audios:
- await audio.preload(request, audio_preload_config=audio_preload_config)
+ preload_coroutines = [image.preload(request) for image in self.images]
+ preload_coroutines.extend(
+ audio.preload(request, audio_preload_config=audio_preload_config) for audio in self.audios
+ )
+ if preload_coroutines:
+ await asyncio.gather(*preload_coroutines)
return
def to_dict(self):
From 30d86034554ade2d9fe350986c76a31526d2b4cc Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 08:58:05 +0000
Subject: [PATCH 16/51] single file without _resource_lock
---
lightllm/server/httpserver/manager.py | 107 +++++++++++++++++---------
1 file changed, 69 insertions(+), 38 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 9a6864774a..d7490ebfcd 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -131,6 +131,36 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str
logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
return
+ def _prepare_multimodal_resource_inputs(
+ self, multimodal_params: MultimodalParams, sampling_params: SamplingParams
+ ) -> Tuple[List[Union[ImageItem, AudioItem]], List[str], List[int], List[bytes]]:
+ items, md5sums, tokens_nums, datas = [], [], [], []
+
+ for img in multimodal_params.images:
+ self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
+ data = img.read()
+ token_num = self.tokenizer.get_image_token_length(img)
+ md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
+ md5sums.append(md5sum)
+ tokens_nums.append(token_num)
+ datas.append(data)
+ items.append(img)
+
+ for audio in multimodal_params.audios:
+ self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
+ data = audio.read()
+ token_num = self.tokenizer.get_audio_token_length(audio)
+ payload_md5 = audio.extra_params.get("audio_payload_md5")
+ if payload_md5 is None:
+ payload_md5 = hashlib.md5(data).hexdigest()
+ md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
+ md5sums.append(md5sum)
+ tokens_nums.append(token_num)
+ datas.append(data)
+ items.append(audio)
+
+ return items, md5sums, tokens_nums, datas
+
async def _alloc_resource(self, items, md5sums, token_nums, datas):
while True:
@@ -163,34 +193,16 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
if self.pd_mode.is_P_or_NORMAL():
+ items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs(
+ multimodal_params, sampling_params
+ )
+ if len(items) <= 1:
+ await self._alloc_resource(items, md5sums, tokens_nums, datas)
+ return
# 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。
# 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10,
# 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。
async with self._resource_lock:
- items, md5sums, tokens_nums, datas = [], [], [], []
- for img in multimodal_params.images:
- self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
- data = img.read()
- # must after init_imageitem_extral_params
- token_num = self.tokenizer.get_image_token_length(img)
- md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
- md5sums.append(md5sum)
- tokens_nums.append(token_num)
- datas.append(data)
- items.append(img)
- for audio in multimodal_params.audios:
- self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
- data = audio.read()
- token_num = self.tokenizer.get_audio_token_length(audio)
- payload_md5 = audio.extra_params.get("audio_payload_md5")
- if payload_md5 is None:
- payload_md5 = hashlib.md5(data).hexdigest()
- md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
- md5sums.append(md5sum)
- tokens_nums.append(token_num)
- datas.append(data)
- items.append(audio)
-
await self._alloc_resource(items, md5sums, tokens_nums, datas)
return
@@ -295,6 +307,13 @@ async def generate(
request.state.lightllm_req_id = group_request_id
audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
image_count = len(multimodal_params.images) if multimodal_params is not None else 0
+ self._log_stage_timing(
+ group_request_id,
+ start_time,
+ "received",
+ has_audio=audio_count > 0,
+ has_image=image_count > 0,
+ )
try:
original_multimodal_params = None
@@ -316,7 +335,7 @@ async def generate(
# 记录请求到达的相关信息
await self._log_req_header(request_headers, group_request_id)
# encode
- prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
+ prompt_ids = await self._encode(prompt, multimodal_params, sampling_params, start_time=start_time)
self._log_stage_timing(
group_request_id,
start_time,
@@ -481,7 +500,11 @@ async def _log_req_header(self, request_headers, group_request_id: int):
return
async def _encode(
- self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams
+ self,
+ prompt: Union[str, List[int]],
+ multimodal_params: MultimodalParams,
+ sampling_params: SamplingParams,
+ start_time: Optional[float] = None,
):
if isinstance(prompt, str):
if self.enable_multimodal:
@@ -490,15 +513,23 @@ async def _encode(
), "too many multimodal items!"
if multimodal_params.audios:
assert not self.args.disable_audio, "audio multimodal not enabled"
- encode_start_time = time.time()
await self._alloc_multimodal_resources(multimodal_params, sampling_params)
log_req_id = getattr(sampling_params, "group_request_id", None)
- logger.info(
- f"lightllm_req_id:{log_req_id} "
- f"stage:alloc_multimodal_resources_done "
- f"elapsed_ms:{(time.time() - encode_start_time) * 1000.0:.3f} "
- f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
- )
+ if start_time is None:
+ logger.info(
+ f"lightllm_req_id:{log_req_id} "
+ f"stage:alloc_multimodal_resources_done "
+ f"elapsed_ms:0.000 "
+ f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
+ )
+ else:
+ self._log_stage_timing(
+ log_req_id,
+ start_time,
+ "alloc_multimodal_resources_done",
+ audio_count=len(multimodal_params.audios),
+ image_count=len(multimodal_params.images),
+ )
prompt_ids = self.tokenizer.encode(
prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
)
@@ -592,7 +623,7 @@ async def transfer_to_next_module(
if self.pd_mode.is_P_or_NORMAL():
if not self.args.disable_vision:
- logger.info(
+ logger.debug(
f"lightllm_req_id:{group_req_objs.group_req_id} "
f"stage:transfer_to_visual "
f"target_port:{self.args.visual_port}"
@@ -601,7 +632,7 @@ async def transfer_to_next_module(
return
if not self.args.disable_audio:
- logger.info(
+ logger.debug(
f"lightllm_req_id:{group_req_objs.group_req_id} "
f"stage:transfer_to_audio "
f"target_port:{self.args.audio_port}"
@@ -610,7 +641,7 @@ async def transfer_to_next_module(
return
if self.args.enable_cpu_cache:
- logger.info(
+ logger.debug(
f"lightllm_req_id:{group_req_objs.group_req_id} "
f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}"
)
@@ -620,7 +651,7 @@ async def transfer_to_next_module(
)
return
- logger.info(
+ logger.debug(
f"lightllm_req_id:{group_req_objs.group_req_id} "
f"stage:transfer_to_router "
f"target_port:{self.args.router_port}"
@@ -633,7 +664,7 @@ async def transfer_to_next_module(
if self.pd_mode.is_D():
# 在 D 模式下,不需要传输真的多模态参数,因为其已经被 P 处理好了
- logger.info(
+ logger.debug(
f"lightllm_req_id:{group_req_objs.group_req_id} "
f"stage:transfer_to_router_from_decode "
f"target_port:{self.args.router_port}"
From db3e63b4ddb827003371d4e14650cdd3374415d3 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 2 Apr 2026 09:00:00 +0000
Subject: [PATCH 17/51] use deque instead of list
---
lightllm/server/audioserver/manager.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
index d54856c265..a8ccb29891 100644
--- a/lightllm/server/audioserver/manager.py
+++ b/lightllm/server/audioserver/manager.py
@@ -8,6 +8,7 @@
import inspect
import setproctitle
import time
+from collections import deque
from typing import Dict, List
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
@@ -43,7 +44,7 @@ def __init__(
self.cache_client = rpyc.connect("localhost", args.cache_port, config={"allow_pickle": True})
self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
self.cache_port = args.cache_port
- self.waiting_reqs: List[GroupReqIndexes] = []
+ self.waiting_reqs = deque()
self.model_weightdir = args.model_dir
self.tp_world_size = args.tp
self.audio_dp = args.audio_dp
@@ -140,7 +141,7 @@ async def loop_for_fwd(self):
processing_group_reqs = []
audios_need_infer = []
while len(self.waiting_reqs) > 0:
- group_req_indexes = self.waiting_reqs.pop(0)
+ group_req_indexes = self.waiting_reqs.popleft()
self._log_req_stage(
group_req_indexes.group_req_id,
"audio_queue_picked",
@@ -174,7 +175,7 @@ async def loop_for_fwd(self):
and len(self.waiting_reqs) == 0
and len(pending_audios) < self.infer_batch_size
):
- logger.info(
+ logger.debug(
f"audio_batch_ready req_ids:[{group_req_indexes.group_req_id}] "
f"audio_count:{len(pending_audios)} infer_batch_size:{self.infer_batch_size} fast_path:1"
)
@@ -210,7 +211,7 @@ async def loop_for_fwd(self):
[group_req_indexes] if current_req_has_pending_audio else []
)
batch_req_ids = [req.group_req_id for req in batch_reqs]
- logger.info(
+ logger.debug(
f"audio_batch_ready req_ids:{batch_req_ids} "
f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
)
@@ -250,7 +251,7 @@ async def loop_for_fwd(self):
if len(audios_need_infer) > 0:
batch_req_ids = [req.group_req_id for req in processing_group_reqs]
- logger.info(
+ logger.debug(
f"audio_batch_ready req_ids:{batch_req_ids} "
f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}"
)
@@ -274,7 +275,7 @@ async def loop_for_netio_req(self):
while True:
recv_req: GroupReqIndexes = await self.zmq_recv_socket.recv_pyobj()
if isinstance(recv_req, GroupReqIndexes):
- logger.info(
+ logger.debug(
f"audio recv req id {recv_req.group_req_id} "
f"audio count {len(recv_req.multimodal_params.audios)}"
)
From 878c2f938267f81fb5edc989f31ad93659758df4 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 04:50:45 +0000
Subject: [PATCH 18/51] chore: format merged audio/httpserver files
---
lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +-
lightllm/models/whisper/whisper_audio.py | 3 ++-
lightllm/server/httpserver/manager.py | 6 ------
3 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c14df5ff9d..c08dd68a2f 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -101,7 +101,7 @@ def __init__(self, d_model, encoder_attention_heads, attention_dropout):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {self.num_heads})."
)
- self.scaling = self.head_dim ** -0.5
+ self.scaling = self.head_dim**-0.5
self.attention_dropout = 0.0
self.is_decoder = False
self.is_causal = False
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 7eb2948281..750bf8e158 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -11,6 +11,7 @@
from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
+
# tokenizer_class removed
class WhisperProcessor(ProcessorMixin):
r"""
@@ -38,7 +39,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def get_T_after_cnn(self, L_in, dilation=1):
- for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
+ for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "):
L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
L_out = 1 + L_out // stride
L_in = L_out
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 94065cfc3a..c9eb4de543 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -164,7 +164,6 @@ def _prepare_multimodal_resource_inputs(
return items, md5sums, tokens_nums, datas
async def _alloc_resource(self, items, md5sums, token_nums, datas):
-
while True:
records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
@@ -489,7 +488,6 @@ def _count_multimodal_tokens(self, multimodal_params: MultimodalParams) -> Tuple
return image_tokens, audio_tokens
async def _log_req_header(self, request_headers, group_request_id: int):
-
x_request_id = request_headers.get("X-Request-Id", "")
x_session_id = request_headers.get("X-Session-Id", "")
@@ -622,7 +620,6 @@ async def transfer_to_next_module(
self,
group_req_objs: Optional[GroupReqObjs] = None,
):
-
if self.pd_mode.is_P_or_NORMAL():
if not self.args.disable_vision:
logger.debug(
@@ -689,7 +686,6 @@ async def _wait_to_token_package(
req_status: "ReqStatus",
request: Request,
):
-
event = req_status.event
unfinished_count = sampling_params.best_of
out_token_counter = 0
@@ -820,7 +816,6 @@ async def recycle_resource_loop(self):
pre_time_mark = time.time()
while True:
-
try:
await asyncio.wait_for(self.recycle_event.wait(), timeout=0.02)
except asyncio.TimeoutError:
@@ -897,7 +892,6 @@ async def handle_loop(self):
for _ in range(read_token_count):
if not req.out_tokens_queue.is_empty():
-
text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
req.cumlogprob += float(req.shm_logprobs.arr[src_index])
metadata = {
From ab788d9c41e2311f7cc1f5c41ea2bd2ec849d6db Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 04:59:16 +0000
Subject: [PATCH 19/51] chore: improve qwen3 omni audio formatting
---
lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c08dd68a2f..c14df5ff9d 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -101,7 +101,7 @@ def __init__(self, d_model, encoder_attention_heads, attention_dropout):
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {self.num_heads})."
)
- self.scaling = self.head_dim**-0.5
+ self.scaling = self.head_dim ** -0.5
self.attention_dropout = 0.0
self.is_decoder = False
self.is_causal = False
From 0570b965a732e2f5a0655164c82844d150a35bdb Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 05:43:13 +0000
Subject: [PATCH 20/51] =?UTF-8?q?fix=C3=A2=C2=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
lightllm/server/audioserver/model_infer/model_rpc.py | 12 ++++++++++++
lightllm/server/core/objs/start_args_type.py | 2 --
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 76d5787b48..343816e1fd 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -158,6 +158,10 @@ def _get_audio_items_from_infer_queue(self, max_num: int, force_same: bool = Fal
return tasks
def _get_audio_items_from_store_queue(self, max_num: int) -> List[AudioItem]:
+ """
+ 与 visual 的 _get_image_items_from_store_queue 一致:store 队列中单条为 AudioItem,
+ 按批取出至多 max_num 条。
+ """
tasks = []
task = self.store_queue.get(block=True)
tasks.append(task)
@@ -172,6 +176,9 @@ def _get_audio_items_from_store_queue(self, max_num: int) -> List[AudioItem]:
return tasks
def _infer_worker(self):
+ """
+ 与 visual _infer_worker 一致:推理后对每个 item 单独放入 store_queue,由 store 线程批处理再 commit。
+ """
torch.cuda.set_device(self.device_id)
while True:
try:
@@ -190,6 +197,7 @@ def _infer_worker(self):
self._save_to_cpu_cache(all_embeds=all_embeds, audios=audios)
+ # 与 visual _store_to_cpu_cache 相同条入队,便于 store 侧按 infer_max_batch_size 聚合
for audio in audios:
self.store_queue.put(audio)
@@ -208,6 +216,7 @@ def _save_to_cpu_cache(self, all_embeds: List[torch.Tensor], audios: List[AudioI
return
def _commit_to_cpu_cache(self, audios: List[AudioItem]):
+ # 与 visual _commit_to_cpu_cache:仅 tp0 通知完成;embed 已在 model.encode 内写入 cache
if self.tp_rank_id == 0:
for audio in audios:
audio.cuda_event.synchronize()
@@ -221,6 +230,9 @@ def _commit_to_cpu_cache(self, audios: List[AudioItem]):
self._log_latency(audios[0], "set_items_embed")
def _store_worker(self):
+ """
+ 与 visual _store_worker 一致:从 store 队列按批取 AudioItem,再 commit 并释放信号量。
+ """
while True:
try:
audios: List[AudioItem] = self._get_audio_items_from_store_queue(max_num=self.infer_max_batch_size)
diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
index a920a09710..ac9bd9e180 100644
--- a/lightllm/server/core/objs/start_args_type.py
+++ b/lightllm/server/core/objs/start_args_type.py
@@ -107,12 +107,10 @@ class StartArgs:
push_interval: int = field(default=10)
visual_node_id: int = field(default=None)
visual_infer_batch_size: int = field(default=None)
- audio_infer_batch_size: int = field(default=None)
visual_send_batch_size: int = field(default=1)
visual_gpu_ids: List[int] = field(default_factory=lambda: [0])
visual_tp: int = field(default=1)
visual_dp: int = field(default=1)
- audio_dp: int = field(default=1)
visual_nccl_ports: List[int] = field(default=None)
visual_rpyc_port: Optional[int] = field(default=None)
audio_gpu_ids: Optional[List[int]] = field(default=None)
From 70aad721087731a2253a7b88a631a9994b53f3c5 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Fri, 3 Apr 2026 06:36:43 +0000
Subject: [PATCH 21/51] fix
---
.../qwen3next/triton_kernel/causal_conv1d.py | 20 +++++++++++++++++--
lightllm/server/api_cli.py | 4 ----
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
index c6d099a2d8..3371aca71a 100644
--- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -4,8 +4,20 @@
import torch
-from sgl_kernel import causal_conv1d_fwd
-from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+try:
+ from sgl_kernel import causal_conv1d_fwd
+ from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+except ImportError:
+ causal_conv1d_fwd = None
+ causal_conv1d_update_kernel = None
+ logger.warning(
+ "sgl_kernel is not installed, qwen3next/qwen3.5 causal_conv1d kernels are unavailable. "
+ "Install `sgl_kernel` before serving those models."
+ )
def causal_conv1d_fn(
@@ -51,6 +63,8 @@ def causal_conv1d_fn(
"""
if activation not in [None, "silu", "swish"]:
raise NotImplementedError("activation must be None, silu, or swish")
+ if causal_conv1d_fwd is None:
+ raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_fn")
if x.stride(-1) != 1:
x = x.contiguous()
bias = bias.contiguous() if bias is not None else None
@@ -103,6 +117,8 @@ def causal_conv1d_update(
"""
if activation not in [None, "silu", "swish"]:
raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
+ if causal_conv1d_update_kernel is None:
+ raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_update")
activation_val = activation in ["silu", "swish"]
unsqueeze = x.dim() == 2
if unsqueeze:
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
index 01bf4d306b..89aeeec833 100644
--- a/lightllm/server/api_cli.py
+++ b/lightllm/server/api_cli.py
@@ -460,9 +460,6 @@ def make_argument_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch"
)
- parser.add_argument(
- "--audio_infer_batch_size", type=int, default=None, help="number of audios to process in each inference batch"
- )
parser.add_argument(
"--visual_send_batch_size",
type=int,
@@ -477,7 +474,6 @@ def make_argument_parser() -> argparse.ArgumentParser:
)
parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT")
parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT")
- parser.add_argument("--audio_dp", type=int, default=1, help="number of data parallel instances for audio encoder")
parser.add_argument(
"--visual_nccl_ports",
nargs="+",
From 86a16f708d42395e8692022ae28a8805fbcb1b27 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 02:50:53 +0000
Subject: [PATCH 22/51] fix md5 computation for multimodal items and reduce stage-timing log verbosity
---
lightllm/server/httpserver/manager.py | 79 +++++++++++----------------
1 file changed, 31 insertions(+), 48 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index c9eb4de543..e9843c8237 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -128,41 +128,9 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str
cost_ms = (time.time() - start_time) * 1000.0
extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
suffix = f" {extras}" if extras else ""
- logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+ logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
return
- def _prepare_multimodal_resource_inputs(
- self, multimodal_params: MultimodalParams, sampling_params: SamplingParams
- ) -> Tuple[List[Union[ImageItem, AudioItem]], List[str], List[int], List[bytes]]:
- items, md5sums, tokens_nums, datas = [], [], [], []
-
- for img in multimodal_params.images:
- self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
- data = img.read()
- token_num = self.tokenizer.get_image_token_length(img)
- md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
- img.md5 = md5sum
- md5sums.append(md5sum)
- tokens_nums.append(token_num)
- datas.append(data)
- items.append(img)
-
- for audio in multimodal_params.audios:
- self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
- data = audio.read()
- token_num = self.tokenizer.get_audio_token_length(audio)
- payload_md5 = audio.extra_params.get("audio_payload_md5")
- if payload_md5 is None:
- payload_md5 = hashlib.md5(data).hexdigest()
- md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params)))
- audio.md5 = md5sum
- md5sums.append(md5sum)
- tokens_nums.append(token_num)
- datas.append(data)
- items.append(audio)
-
- return items, md5sums, tokens_nums, datas
-
async def _alloc_resource(self, items, md5sums, token_nums, datas):
while True:
records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
@@ -197,6 +165,29 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs(
multimodal_params, sampling_params
)
+ items, md5sums, tokens_nums, datas = [], [], [], []
+ for img in multimodal_params.images:
+ self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
+ data = img.read()
+ token_num = self.tokenizer.get_image_token_length(img)
+ md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
+ img.md5 = md5sum
+ md5sums.append(md5sum)
+ tokens_nums.append(token_num)
+ datas.append(data)
+ items.append(img)
+ for audio in multimodal_params.audios:
+ self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
+ data = audio.read()
+ token_num = self.tokenizer.get_audio_token_length(audio)
+ payload_md5 = audio.extra_params.get("audio_payload_md5")
+ md5sum = payload_md5
+ audio.md5 = md5sum
+ md5sums.append(md5sum)
+ tokens_nums.append(token_num)
+ datas.append(data)
+ items.append(audio)
+
if len(items) <= 1:
await self._alloc_resource(items, md5sums, tokens_nums, datas)
return
@@ -515,21 +506,13 @@ async def _encode(
assert not self.args.disable_audio, "audio multimodal not enabled"
await self._alloc_multimodal_resources(multimodal_params, sampling_params)
log_req_id = getattr(sampling_params, "group_request_id", None)
- if start_time is None:
- logger.info(
- f"lightllm_req_id:{log_req_id} "
- f"stage:alloc_multimodal_resources_done "
- f"elapsed_ms:0.000 "
- f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}"
- )
- else:
- self._log_stage_timing(
- log_req_id,
- start_time,
- "alloc_multimodal_resources_done",
- audio_count=len(multimodal_params.audios),
- image_count=len(multimodal_params.images),
- )
+ self._log_stage_timing(
+ log_req_id,
+ start_time,
+ "alloc_multimodal_resources_done",
+ audio_count=len(multimodal_params.audios),
+ image_count=len(multimodal_params.images),
+ )
prompt_ids = self.tokenizer.encode(
prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
)
From 46016378357ff382ecc492405bd9c3cdfc4ee6c9 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 02:51:43 +0000
Subject: [PATCH 23/51] fix md5
---
lightllm/server/multimodal_params.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index ce20e5d657..0aac1874c8 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -13,6 +13,8 @@
from fastapi import Request
from lightllm.utils.multimodal_utils import fetch_resource
from lightllm.utils.log_utils import init_logger
+from frozendict import frozendict
+
logger = init_logger(__name__)
RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
@@ -118,7 +120,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
self._preload_data = audio_values.tobytes()
- self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest()
+ self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
logger.info(
f"lightllm_req_id:{req_id} stage:audio_preload_done "
f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
From 16203e4510f89d23acca2d81d4862975eed82d4c Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 03:00:13 +0000
Subject: [PATCH 24/51] format
---
lightllm/server/multimodal_params.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 0aac1874c8..e62e73fade 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -120,7 +120,9 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
self._preload_data = audio_values.tobytes()
- self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
+ self.extra_params["audio_payload_md5"] = (
+ hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
+ )
logger.info(
f"lightllm_req_id:{req_id} stage:audio_preload_done "
f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
From 93421d28662a45bc8ac86e658561a33a612612ef Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 03:13:51 +0000
Subject: [PATCH 25/51] use asyncio.to_thread for audio decoding so the event loop
 is not blocked and the server can keep handling other concurrent requests
---
lightllm/server/multimodal_params.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index e62e73fade..5847975878 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -94,7 +94,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
# check if valid audio bytes
decode_start = time.time()
- audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate)
+ audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=target_sample_rate)
audio_values = np.asarray(audio_values, dtype=np.float32)
decode_cost_ms = (time.time() - decode_start) * 1000.0
effective_audio_len = max(audio_values.shape[0], min_audio_len)
From f7b05898d0948404d685ab5094ed4c1aab2bd27e Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 06:06:25 +0000
Subject: [PATCH 26/51] fix: remove leftover call to the deleted _prepare_multimodal_resource_inputs helper
---
lightllm/server/httpserver/manager.py | 3 ---
1 file changed, 3 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e9843c8237..0e4a9b79eb 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -162,9 +162,6 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
if self.pd_mode.is_P_or_NORMAL():
- items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs(
- multimodal_params, sampling_params
- )
items, md5sums, tokens_nums, datas = [], [], [], []
for img in multimodal_params.images:
self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
From 0ea215605bec31a05c33b8b9b6ea1832a8ac6464 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Tue, 7 Apr 2026 08:52:11 +0000
Subject: [PATCH 27/51] fix
---
lightllm/utils/multimodal_utils.py | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py
index 5f4fd18516..4b49ea8891 100644
--- a/lightllm/utils/multimodal_utils.py
+++ b/lightllm/utils/multimodal_utils.py
@@ -4,14 +4,11 @@
import httpx
from PIL import Image
from io import BytesIO
-from urllib.parse import urlparse
-from typing import Dict, Optional
from fastapi import Request
+from functools import lru_cache
from lightllm.utils.log_utils import init_logger
logger = init_logger(__name__)
-_HTTP_CLIENTS: Dict[Optional[str], httpx.AsyncClient] = {}
-_LOCAL_RESOURCE_HOSTS = {"127.0.0.1", "localhost", "::1"}
def _httpx_async_client_proxy_kwargs(proxy) -> dict:
@@ -39,15 +36,17 @@ def image2base64(img_str: str):
return base64.b64encode(buffer.getvalue()).decode("utf-8")
+@lru_cache(maxsize=256)
+def _get_xhttp_client(proxy=None):
+ kvargs = _httpx_async_client_proxy_kwargs(proxy)
+ kvargs["limits"] = httpx.Limits(max_connections=10000, max_keepalive_connections=20)
+ return httpx.AsyncClient(**kvargs)
+
+
async def fetch_resource(url, request: Request, timeout, proxy=None):
logger.info(f"Begin to download resource from url: {url}")
start_time = time.time()
- hostname = urlparse(url).hostname
- effective_proxy = None if hostname in _LOCAL_RESOURCE_HOSTS else proxy
- client = _HTTP_CLIENTS.get(effective_proxy)
- if client is None:
- client = httpx.AsyncClient(**_httpx_async_client_proxy_kwargs(effective_proxy))
- _HTTP_CLIENTS[effective_proxy] = client
+ client = _get_xhttp_client(proxy)
async with client.stream("GET", url, timeout=timeout) as response:
response.raise_for_status()
ans_bytes = []
From 6856540018aff2b4614b64e1af88374da721ac84 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 13:20:21 +0000
Subject: [PATCH 28/51] fix
---
.../qwen3next/triton_kernel/causal_conv1d.py | 23 ++++---------------
1 file changed, 4 insertions(+), 19 deletions(-)
diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
index 3371aca71a..2bf325340f 100644
--- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
+++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py
@@ -4,21 +4,6 @@
import torch
-from lightllm.utils.log_utils import init_logger
-
-logger = init_logger(__name__)
-
-try:
- from sgl_kernel import causal_conv1d_fwd
- from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
-except ImportError:
- causal_conv1d_fwd = None
- causal_conv1d_update_kernel = None
- logger.warning(
- "sgl_kernel is not installed, qwen3next/qwen3.5 causal_conv1d kernels are unavailable. "
- "Install `sgl_kernel` before serving those models."
- )
-
def causal_conv1d_fn(
x: torch.Tensor,
@@ -63,8 +48,8 @@ def causal_conv1d_fn(
"""
if activation not in [None, "silu", "swish"]:
raise NotImplementedError("activation must be None, silu, or swish")
- if causal_conv1d_fwd is None:
- raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_fn")
+ from sgl_kernel import causal_conv1d_fwd
+
if x.stride(-1) != 1:
x = x.contiguous()
bias = bias.contiguous() if bias is not None else None
@@ -117,8 +102,8 @@ def causal_conv1d_update(
"""
if activation not in [None, "silu", "swish"]:
raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}")
- if causal_conv1d_update_kernel is None:
- raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_update")
+ from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
activation_val = activation in ["silu", "swish"]
unsqueeze = x.dim() == 2
if unsqueeze:
From 9d0671b7ba3b01c995f8a4e4fefef7fb94d80f8d Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 13:26:37 +0000
Subject: [PATCH 29/51] gate stage-timing logs behind the detail_log flag
---
lightllm/server/httpserver/manager.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 0e4a9b79eb..e2a0dbc4b6 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -125,10 +125,11 @@ def __init__(
return
def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs):
- cost_ms = (time.time() - start_time) * 1000.0
- extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
- suffix = f" {extras}" if extras else ""
- logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
+ if self.args.detail_log:
+ cost_ms = (time.time() - start_time) * 1000.0
+ extras = " ".join(f"{k}:{v}" for k, v in kwargs.items())
+ suffix = f" {extras}" if extras else ""
+ logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}")
return
async def _alloc_resource(self, items, md5sums, token_nums, datas):
From 8e21207325fd8205cedebf6c9f30efa60a152bbb Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 13:40:33 +0000
Subject: [PATCH 30/51] delete warmup
---
.../qwen3_omni_audio.py | 24 ---------------
lightllm/models/whisper/whisper_audio.py | 20 -------------
lightllm/server/api_http.py | 7 +----
.../audioserver/model_infer/model_rpc.py | 30 -------------------
lightllm/server/multimodal_params.py | 22 --------------
5 files changed, 1 insertion(+), 102 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c14df5ff9d..7d525915af 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -2,9 +2,7 @@
import json
import math
import torch
-import librosa
import numpy as np
-from io import BytesIO
from torch import Tensor, nn
from safetensors import safe_open
from torch.nn import functional as F
@@ -16,10 +14,6 @@
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
-from lightllm.utils.log_utils import init_logger
-
-
-logger = init_logger(__name__)
def _get_feat_extract_output_lengths(input_lengths):
@@ -376,21 +370,3 @@ def encode(self, audio_items: List[AudioItem]):
all_embeds.append(cur_embed)
return all_embeds, audio_items
-
- @torch.no_grad()
- def warmup(self, audio_bytes: bytes):
- audio = BytesIO(audio_bytes)
- audio, _ = librosa.load(audio, sr=self.processor.sampling_rate)
- num_frames = max(audio.shape[0], 480) // self.processor.hop_length
- padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * (
- self.processor.hop_length
- )
- if padded_len > audio.shape[0]:
- audio = np.pad(audio, (0, padded_len - audio.shape[0]), mode="constant", constant_values=0.0)
- input_features, feature_lens = self.processor._preprocess_single_padded(audio, num_frames, device="cpu")
- _ = self.forward(
- input_features,
- feature_lens=feature_lens,
- )
- torch.cuda.current_stream().synchronize()
- return
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 750bf8e158..4cd9619e55 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -1,10 +1,8 @@
import os
import json
-import librosa
import numpy as np
import torch
import torch.nn.functional as F
-from io import BytesIO
from typing import List, Union
from safetensors.torch import load_file
from transformers.processing_utils import ProcessorMixin
@@ -225,21 +223,3 @@ def encode(self, audio_items: List[AudioItem]):
ans_embeds.append(cur_embed)
return ans_embeds, audio_items
-
- @torch.no_grad()
- def warmup(self, audio_bytes: bytes):
- audio = BytesIO(audio_bytes)
- audio, _ = librosa.load(audio, sr=16000)
-
- from .defaults import MIN_AUDIO_LEN
-
- if audio.shape[0] < MIN_AUDIO_LEN:
- audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0)
-
- batch_audio_lens = np.array([min(audio.shape[0], self.max_length)], dtype=np.int32)
- audios, audio_lens_after_cnn = self.audio_processor(
- [audio], batch_audio_lens, sampling_rate=16000, return_tensors="pt"
- )
- _ = self.forward(audios, audio_lens_after_cnn)
- torch.cuda.current_stream().synchronize()
- return
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 1322168e38..40d20bcd27 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -41,7 +41,7 @@
from fastapi.responses import Response, StreamingResponse, JSONResponse
from lightllm.server.core.objs.sampling_params import SamplingParams
from lightllm.server.core.objs import StartArgs
-from .multimodal_params import MultimodalParams, warmup_audio_preload
+from .multimodal_params import MultimodalParams
from .httpserver.manager import HttpServerManager
from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster
from .api_lightllm import lightllm_get_score
@@ -389,11 +389,6 @@ async def startup_event():
logger.info("server start up")
loop = asyncio.get_event_loop()
g_objs.set_args(get_env_start_args())
- if g_objs.args.enable_multimodal and not g_objs.args.disable_audio:
- warmup_start = time.time()
- logger.info("http_audio_preload_warmup_start")
- await warmup_audio_preload()
- logger.info(f"http_audio_preload_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}")
loop.create_task(g_objs.httpserver_manager.handle_loop())
logger.info(f"server start up ok, loop use is {asyncio.get_event_loop()}")
return
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 343816e1fd..8a04231508 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -1,10 +1,6 @@
-import io
import queue
import threading
import time
-import wave
-
-import numpy as np
import rpyc
import socket
import torch
@@ -25,17 +21,6 @@
logger = init_logger(__name__)
-def _generate_silence_wav_bytes(sample_rate: int = 16000, num_samples: int = 16000) -> bytes:
- samples = np.zeros(num_samples, dtype=np.int16)
- buffer = io.BytesIO()
- with wave.open(buffer, "wb") as wav_file:
- wav_file.setnchannels(1)
- wav_file.setsampwidth(2)
- wav_file.setframerate(sample_rate)
- wav_file.writeframes(samples.tobytes())
- return buffer.getvalue()
-
-
class AudioModelRpcServer(rpyc.Service):
def exposed_init_model(self, kvargs):
kvargs = obtain(kvargs)
@@ -74,7 +59,6 @@ def exposed_init_model(self, kvargs):
create_meta_data=False,
init_shm_data=False,
)
- self._auto_warmup_model()
self._init_taskes()
except Exception as e:
print("#" * 16)
@@ -87,20 +71,6 @@ def exposed_init_model(self, kvargs):
set_random_seed(2147483647)
return
- def _auto_warmup_model(self):
- if not hasattr(self.model, "warmup"):
- return
- try:
- torch.cuda.set_device(self.device_id)
- warmup_audio = _generate_silence_wav_bytes()
- self.model.warmup(warmup_audio)
- logger.info(
- f"audio model auto warmup finished on dp_rank_id:{self.dp_rank_id} tp_rank_id:{self.tp_rank_id}"
- )
- except Exception as e:
- logger.exception(f"audio model auto warmup failed: {e}")
- raise
-
def exposed_run_task(self, audios: List[AudioItem], ref_event_list: List[threading.Event]):
try:
audios = obtain(audios)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 5847975878..79ef2fe028 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,7 +1,6 @@
"""Multimodal parameters for text generation."""
import asyncio
import os
-import wave
import time
import librosa
import base64
@@ -25,17 +24,6 @@
DEFAULT_MIN_AUDIO_LEN = 480
-def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes:
- num_samples = max(1, int(sample_rate * duration_seconds))
- with BytesIO() as buffer:
- with wave.open(buffer, "wb") as wav_file:
- wav_file.setnchannels(1)
- wav_file.setsampwidth(2)
- wav_file.setframerate(sample_rate)
- wav_file.writeframes(b"\x00\x00" * num_samples)
- return buffer.getvalue()
-
-
def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT)
if audio_shm_format == WAVEFORM_F32_SHM_FORMAT:
@@ -273,13 +261,3 @@ def to_origin_dict(self):
ret["images"] = [i.to_origin_dict() for i in self.images]
ret["audios"] = [a.to_origin_dict() for a in self.audios]
return ret
-
-
-async def warmup_audio_preload():
- warmup_audio = AudioItem(
- type="base64",
- data=base64.b64encode(generate_silence_wav_bytes()).decode("utf-8"),
- )
- await warmup_audio.preload(None)
- warmup_audio.read()
- return
From fe39faa1b994802083fa7acd3539a5371eebcbad Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 14:21:01 +0000
Subject: [PATCH 31/51] delete audio_preload_config
---
.../common/basemodel/multimodal_tokenizer.py | 1 -
lightllm/models/internvl/model.py | 5 --
.../models/qwen3_omni_moe_thinker/model.py | 5 --
.../qwen3_omni_audio.py | 26 +++----
lightllm/server/api_http.py | 4 +-
.../audioserver/model_infer/model_rpc.py | 1 -
lightllm/server/httpserver/manager.py | 4 +-
lightllm/server/multimodal_params.py | 76 +++----------------
8 files changed, 24 insertions(+), 98 deletions(-)
diff --git a/lightllm/common/basemodel/multimodal_tokenizer.py b/lightllm/common/basemodel/multimodal_tokenizer.py
index 872a418bf7..cdcbd7f089 100644
--- a/lightllm/common/basemodel/multimodal_tokenizer.py
+++ b/lightllm/common/basemodel/multimodal_tokenizer.py
@@ -33,7 +33,6 @@
class BaseMultiModalTokenizer(ABC):
def __init__(self, tokenizer, **kwargs):
self.tokenizer = tokenizer
- self.audio_preload_config = None
def __getattr__(self, name):
obj_dict = object.__getattribute__(self, "__dict__")
diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py
index 70c797aeb8..ccb76d3512 100644
--- a/lightllm/models/internvl/model.py
+++ b/lightllm/models/internvl/model.py
@@ -50,11 +50,6 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
self.audio_min_length = MIN_AUDIO_LEN
self.audio_max_length = 16000 * 30
- self.audio_preload_config = {
- "sampling_rate": 16000,
- "hop_length": 160,
- "min_audio_len": int(self.audio_min_length),
- }
def init_imageitem_extral_params(
self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 6ae73fd1d1..79ce939714 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -45,11 +45,6 @@ def __init__(self, tokenizer=None, processor=None, **kwargs):
self.sampling_rate = self.audio_processor.sampling_rate
self.n_samples = self.audio_processor.n_samples
self.hop_length = self.audio_processor.hop_length
- self.audio_preload_config = {
- "sampling_rate": int(self.sampling_rate),
- "hop_length": int(self.hop_length),
- "min_audio_len": int(MIN_AUDIO_LEN),
- }
self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"]
self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"]
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 7d525915af..71fdb3f3b1 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -9,7 +9,7 @@
from typing import Callable, Optional, Union, List
from transformers.activations import ACT2FN
-from lightllm.server.multimodal_params import AudioItem, WAVEFORM_F32_SHM_FORMAT, load_audio_from_shm_payload
+from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
@@ -338,25 +338,19 @@ def encode(self, audio_items: List[AudioItem]):
items.append(item)
audio_data = read_shm(get_shm_name_data(item.uuid))
audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
- audio_num_frames = item.extra_params.get("audio_num_frames")
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
- if audio_num_frames is not None and item.extra_params.get("audio_shm_format") == WAVEFORM_F32_SHM_FORMAT:
- input_features, feature_lens = self.processor._preprocess_single_padded(
- audio, int(audio_num_frames), device="cpu"
- )
+ input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
+ if feature_attention_mask is not None:
+ audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+ input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
else:
- input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True)
- if feature_attention_mask is not None:
- audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
- input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0)
- else:
- audio_feature_lengths = None
-
- feature_lens = (
- audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
- )
+ audio_feature_lengths = None
+
+ feature_lens = (
+ audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1)
+ )
audio_features = self.forward(
input_features,
diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py
index 40d20bcd27..50d992bf9c 100755
--- a/lightllm/server/api_http.py
+++ b/lightllm/server/api_http.py
@@ -300,9 +300,7 @@ async def tokens(request: Request):
multimodal_params_dict = request_dict.get("multimodal_params", {})
multimodal_params = MultimodalParams(**multimodal_params_dict)
- await multimodal_params.verify_and_preload(
- request, audio_preload_config=getattr(g_objs.httpserver_manager.tokenizer, "audio_preload_config", None)
- )
+ await multimodal_params.verify_and_preload(request)
return JSONResponse(
{
"ntokens": g_objs.httpserver_manager.tokens(
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 8a04231508..39a7e06ac3 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -8,7 +8,6 @@
from typing import List
from transformers.configuration_utils import PretrainedConfig
from rpyc.utils.classic import obtain
-
from lightllm.models.whisper.whisper_audio import WhisperAudioModel
from lightllm.models.qwen3_omni_moe_thinker.qwen3_omni_audio import Qwen3OmniMoeAudioEncoder
from lightllm.server.multimodal_params import AudioItem
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index e2a0dbc4b6..acfe04850f 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -311,9 +311,7 @@ async def generate(
original_multimodal_params = copy.deepcopy(multimodal_params)
if self.pd_mode.is_P_or_NORMAL():
- await multimodal_params.verify_and_preload(
- request, audio_preload_config=getattr(self.tokenizer, "audio_preload_config", None)
- )
+ await multimodal_params.verify_and_preload(request)
self._log_stage_timing(
group_request_id,
start_time,
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 79ef2fe028..f103e54ce5 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -16,28 +16,14 @@
logger = init_logger(__name__)
-RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes"
-WAVEFORM_F32_SHM_FORMAT = "waveform_f32"
-AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW"
DEFAULT_AUDIO_SAMPLE_RATE = 16000
-DEFAULT_AUDIO_HOP_LENGTH = 160
-DEFAULT_MIN_AUDIO_LEN = 480
def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
- audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT)
- if audio_shm_format == WAVEFORM_F32_SHM_FORMAT:
- num_samples = int(extra_params.get("audio_num_samples", 0))
- if num_samples > 0:
- return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
- return np.frombuffer(audio_data, dtype=np.float32)
-
- audio, _ = librosa.load(BytesIO(audio_data), sr=sample_rate)
- return np.asarray(audio, dtype=np.float32)
-
-
-def should_use_raw_audio_shm() -> bool:
- return os.getenv(AUDIO_SHM_USE_RAW_ENV, "0") == "1"
+ num_samples = int(extra_params.get("audio_num_samples", 0))
+ if num_samples > 0:
+ return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
+ return np.frombuffer(audio_data, dtype=np.float32)
class AudioItem:
@@ -60,11 +46,8 @@ def __init__(self, **kwargs):
self._preload_data = None
self.extra_params = {}
- async def preload(self, request: Request, audio_preload_config: dict = None):
+ async def preload(self, request: Request):
try:
- req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None)
- preload_start = time.time()
- source_ready_start = preload_start
if self._type == "url":
timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
proxy = os.getenv("REQUEST_PROXY", None)
@@ -73,51 +56,18 @@ async def preload(self, request: Request, audio_preload_config: dict = None):
audio_data = base64.b64decode(self._data)
else:
raise ValueError(f"cannot read audio which type is {self._type}!")
- source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0
-
- audio_preload_config = audio_preload_config or {}
- target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE))
- hop_length = int(audio_preload_config.get("hop_length", DEFAULT_AUDIO_HOP_LENGTH))
- min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN))
# check if valid audio bytes
- decode_start = time.time()
- audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=target_sample_rate)
+ audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=DEFAULT_AUDIO_SAMPLE_RATE)
audio_values = np.asarray(audio_values, dtype=np.float32)
- decode_cost_ms = (time.time() - decode_start) * 1000.0
- effective_audio_len = max(audio_values.shape[0], min_audio_len)
- padded_audio_len = ((effective_audio_len + hop_length - 1) // hop_length) * hop_length
- if padded_audio_len > audio_values.shape[0]:
- audio_values = np.pad(
- audio_values,
- (0, padded_audio_len - audio_values.shape[0]),
- mode="constant",
- constant_values=0.0,
- )
+ from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
- self.audio_length = effective_audio_len
- if should_use_raw_audio_shm():
- self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT
- self.extra_params.pop("audio_sample_rate", None)
- self.extra_params.pop("audio_num_samples", None)
- self.extra_params.pop("audio_num_frames", None)
- self._preload_data = audio_data
- else:
- self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT
- self.extra_params["audio_sample_rate"] = target_sample_rate
- self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
- self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length)
- self._preload_data = audio_values.tobytes()
+ self.audio_length = max(int(audio_values.shape[0]), MIN_AUDIO_LEN)
+ self._preload_data = audio_values.tobytes()
+ self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
self.extra_params["audio_payload_md5"] = (
hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
)
- logger.info(
- f"lightllm_req_id:{req_id} stage:audio_preload_done "
- f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} "
- f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} "
- f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length} "
- f"shm_format:{self.extra_params['audio_shm_format']}"
- )
return
except Exception as e:
@@ -238,11 +188,9 @@ def __init__(
self.audios = [AudioItem(**a) for a in audios]
return
- async def verify_and_preload(self, request: Request, audio_preload_config: dict = None):
+ async def verify_and_preload(self, request: Request):
preload_coroutines = [image.preload(request) for image in self.images]
- preload_coroutines.extend(
- audio.preload(request, audio_preload_config=audio_preload_config) for audio in self.audios
- )
+ preload_coroutines.extend(audio.preload(request) for audio in self.audios)
if preload_coroutines:
await asyncio.gather(*preload_coroutines)
return
From f1c9f0770a5e8452fbffe62c694f0ccfdbbf7d4c Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Tue, 7 Apr 2026 14:24:16 +0000
Subject: [PATCH 32/51] delete _preprocess_single_padded
---
.../qwen3_omni_moe_thinker/audio_process.py | 23 -------------------
1 file changed, 23 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 42eae8edb5..e9dc931886 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -105,29 +105,6 @@ def zero_mean_unit_var_norm(
return normed_input_values
- def _preprocess_single_padded(
- self,
- raw_speech: np.ndarray,
- num_frames: int,
- device: Optional[str] = "cpu",
- ) -> Tuple[torch.Tensor, torch.Tensor]:
- waveform = np.asarray(raw_speech, dtype=np.float32)
- if waveform.ndim != 1:
- raise ValueError(f"single audio fast path expects 1D waveform, got shape={waveform.shape}")
-
- extracted = self._torch_extract_fbank_features(waveform[None, :], device)
- extracted = np.asarray(extracted, dtype=np.float32)
- if extracted.ndim != 3:
- raise ValueError(f"unexpected extracted feature shape={extracted.shape}")
-
- if extracted.shape[-1] < num_frames:
- raise ValueError(f"feature frames {extracted.shape[-1]} < requested num_frames {num_frames}")
-
- compact_features = torch.from_numpy(extracted[:, :, :num_frames]).to(device="cuda", dtype=torch.bfloat16)
- compact_features = compact_features[0].contiguous()
- feature_lens = torch.tensor([num_frames], device="cuda", dtype=torch.long)
- return compact_features, feature_lens
-
def _preprocess(
self,
raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
From 9bee105b4a7e89e27ac11f783872793dcb643ed8 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 01:58:11 +0000
Subject: [PATCH 33/51] refactor: simplify preload task gathering in verify_and_preload
---
lightllm/server/multimodal_params.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index f103e54ce5..2e8ed701e4 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -189,10 +189,11 @@ def __init__(
return
async def verify_and_preload(self, request: Request):
- preload_coroutines = [image.preload(request) for image in self.images]
- preload_coroutines.extend(audio.preload(request) for audio in self.audios)
- if preload_coroutines:
- await asyncio.gather(*preload_coroutines)
+ tasks = [image.preload(request) for image in self.images]
+ tasks += [audio.preload(request) for audio in self.audios]
+
+ if tasks:
+ await asyncio.gather(*tasks)
return
def to_dict(self):
From 6c9c49067cd6d1480685ae2636637b9aefe56cd2 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 02:51:45 +0000
Subject: [PATCH 34/51] rework _alloc_resource: retry loop under resource lock to avoid multimodal cache deadlock
---
lightllm/server/httpserver/manager.py | 82 +++++++++++++++------------
1 file changed, 45 insertions(+), 37 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index acfe04850f..d5dcd37825 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -133,33 +133,48 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str
return
async def _alloc_resource(self, items, md5sums, token_nums, datas):
- while True:
- records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
-
- if records is None:
- await asyncio.sleep(0.1)
- continue
-
- if isinstance(records, str) and "error" in records:
- logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
- raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
-
- update_data_ids = []
- for item, rec, data in zip(items, records, datas):
- item: Union[ImageItem, AudioItem] = item
- item.uuid = rec["id"]
- item.token_id = rec["token_id"]
- item.token_num = rec["token_num"]
- item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
-
- if not rec["data_ready"]:
- create_shm(get_shm_name_data(rec["id"]), data)
- update_data_ids.append(rec["id"])
-
- if update_data_ids:
- self.cache_client.root.set_items_data(update_data_ids)
+ if len(items) == 0:
return
+ for _ in range(1000):
+ # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。
+ # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10,
+ # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。
+ async with self._resource_lock:
+ records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+ if records is not None:
+ break
+ await asyncio.sleep(0.01)
+
+ # 长时间无法申请到足够资源的时候,则开始进行阻塞式尝试,防止其他请求一起申请相关资源。
+ if records is None:
+ async with self._resource_lock:
+ while records is None:
+ records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
+ if records is not None:
+ break
+ await asyncio.sleep(0.1)
+
+ if isinstance(records, str) and "error" in records:
+ logger.error(str(records) + "and try to set --embed_cache_storage_size bigger")
+ raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")
+
+ update_data_ids = []
+ for item, rec, data in zip(items, records, datas):
+ item: Union[ImageItem, AudioItem] = item
+ item.uuid = rec["id"]
+ item.token_id = rec["token_id"]
+ item.token_num = rec["token_num"]
+ item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]
+
+ if not rec["data_ready"]:
+ create_shm(get_shm_name_data(rec["id"]), data)
+ update_data_ids.append(rec["id"])
+
+ if update_data_ids:
+ self.cache_client.root.set_items_data(update_data_ids)
+ return
+
async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
# 只有 P 和 NORMAL 节点需要真的管理多模态资源
if self.pd_mode.is_P_or_NORMAL():
@@ -167,10 +182,11 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
for img in multimodal_params.images:
self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
data = img.read()
+ # must after init_imageitem_extral_params
token_num = self.tokenizer.get_image_token_length(img)
md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
- img.md5 = md5sum
md5sums.append(md5sum)
+ img.md5 = md5sum
tokens_nums.append(token_num)
datas.append(data)
items.append(img)
@@ -178,22 +194,14 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params)
data = audio.read()
token_num = self.tokenizer.get_audio_token_length(audio)
- payload_md5 = audio.extra_params.get("audio_payload_md5")
- md5sum = payload_md5
- audio.md5 = md5sum
+ md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params)))
md5sums.append(md5sum)
+ audio.md5 = md5sum
tokens_nums.append(token_num)
datas.append(data)
items.append(audio)
- if len(items) <= 1:
- await self._alloc_resource(items, md5sums, tokens_nums, datas)
- return
- # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。
- # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10,
- # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。
- async with self._resource_lock:
- await self._alloc_resource(items, md5sums, tokens_nums, datas)
+ await self._alloc_resource(items, md5sums, tokens_nums, datas)
return
async def _release_multimodal_resources(self, multimodal_params: MultimodalParams):
From 3b057d0b6c450f167c6f2534e75d74a1c5801f0c Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 02:55:00 +0000
Subject: [PATCH 35/51] tune _alloc_resource retry count (2000) and sleep interval (5ms)
---
lightllm/server/httpserver/manager.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index d5dcd37825..115be4bd38 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -136,7 +136,7 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
if len(items) == 0:
return
- for _ in range(1000):
+ for _ in range(2000):
# 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。
# 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10,
# 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。
@@ -144,7 +144,7 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
records = obtain(self.cache_client.root.alloc(md5sums, token_nums))
if records is not None:
break
- await asyncio.sleep(0.01)
+ await asyncio.sleep(0.005)
# 长时间无法申请到足够资源的时候,则开始进行阻塞式尝试,防止其他请求一起申请相关资源。
if records is None:
From a8a8130932a90e1a51c4f94665357ed6127005a3 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 03:03:35 +0000
Subject: [PATCH 36/51] AudioItem.preload: pad short audio to MIN_AUDIO_LEN and drop payload md5 bookkeeping
---
lightllm/server/multimodal_params.py | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 2e8ed701e4..e45a28db12 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -1,10 +1,8 @@
"""Multimodal parameters for text generation."""
import asyncio
import os
-import time
import librosa
import base64
-import hashlib
import numpy as np
from typing import List
from io import BytesIO
@@ -12,11 +10,9 @@
from fastapi import Request
from lightllm.utils.multimodal_utils import fetch_resource
from lightllm.utils.log_utils import init_logger
-from frozendict import frozendict
logger = init_logger(__name__)
-DEFAULT_AUDIO_SAMPLE_RATE = 16000
def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
@@ -58,16 +54,19 @@ async def preload(self, request: Request):
raise ValueError(f"cannot read audio which type is {self._type}!")
# check if valid audio bytes
- audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=DEFAULT_AUDIO_SAMPLE_RATE)
+ audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=16000)
audio_values = np.asarray(audio_values, dtype=np.float32)
+
from lightllm.models.whisper.defaults import MIN_AUDIO_LEN
- self.audio_length = max(int(audio_values.shape[0]), MIN_AUDIO_LEN)
+ if audio_values.shape[0] < MIN_AUDIO_LEN:
+ audio_values = np.pad(
+ audio_values, (0, MIN_AUDIO_LEN - audio_values.shape[0]), mode="constant", constant_values=0.0
+ )
+ logger.warning(f"audio length is too short, pad to {MIN_AUDIO_LEN}")
+
+ self.audio_length = int(audio_values.shape[0])
self._preload_data = audio_values.tobytes()
- self.extra_params["audio_num_samples"] = int(audio_values.shape[0])
- self.extra_params["audio_payload_md5"] = (
- hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params)))
- )
return
except Exception as e:
From 4479a6599423cf3a442cb0a937ad89ab07dac8bc Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 03:23:01 +0000
Subject: [PATCH 37/51] WhisperFeatureExtractor: cache hann window and mel filters via lru_cache
---
.../qwen3_omni_moe_thinker/audio_process.py | 28 ++++++++-----------
1 file changed, 11 insertions(+), 17 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index e9dc931886..194914d455 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -5,6 +5,7 @@
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.feature_extraction_utils import BatchFeature
from transformers.utils import TensorType
+from functools import lru_cache
class WhisperFeatureExtractor(SequenceFeatureExtractor):
@@ -46,32 +47,25 @@ def __init__(
norm="slaney",
mel_scale="slaney",
)
- self._hann_window_cache = {}
- self._mel_filters_cache = {}
-
- def _get_cached_feature_tensors(self, device: Union[str, torch.device]):
- device_key = str(device)
- window = self._hann_window_cache.get(device_key)
- if window is None:
- window = torch.hann_window(self.n_fft, device=device)
- self._hann_window_cache[device_key] = window
-
- mel_filters = self._mel_filters_cache.get(device_key)
- if mel_filters is None:
- mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
- self._mel_filters_cache[device_key] = mel_filters
- return window, mel_filters
+
+ @lru_cache(maxsize=12)
+ def get_hann_window(self, device: Union[str, torch.device]):
+ return torch.hann_window(self.n_fft, device=device)
+
+ @lru_cache(maxsize=12)
+ def get_mel_filters(self, device: Union[str, torch.device]):
+ return torch.from_numpy(self.mel_filters).to(device, torch.float32)
def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
waveform = torch.from_numpy(waveform).to(device, torch.float32)
- window, mel_filters = self._get_cached_feature_tensors(device)
+ window = self.get_hann_window(device)
if self.dither != 0.0:
waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs() ** 2
-
+ mel_filters = self.get_mel_filters(device)
mel_spec = mel_filters.T @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
From be595131895792f57532be06a5988923935fae20 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 05:03:39 +0000
Subject: [PATCH 38/51] QWen3OmniTokenizer: replace hand-rolled prompt encode cache with lru_cache
---
lightllm/models/qwen3_omni_moe_thinker/model.py | 15 ++++-----------
1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 79ce939714..1b8fa0110d 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -1,7 +1,8 @@
import os
import json
import librosa
-from collections import OrderedDict
+import copy
+from functools import lru_cache
from io import BytesIO
from lightllm.common.build_utils import repair_config
from lightllm.models.registry import ModelRegistry
@@ -31,8 +32,6 @@
class QWen3OmniTokenizer(QWen3VLTokenizer):
def __init__(self, tokenizer=None, processor=None, **kwargs):
self.tokenizer = tokenizer
- self._prompt_encode_cache = OrderedDict()
- self._prompt_encode_cache_capacity = 64
# image
self.image_processor = processor.image_processor
self.min_pixel = self.image_processor.min_pixels
@@ -69,16 +68,9 @@ def get_audio_token_length(self, audio: AudioItem):
# print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}")
return token_num
+ @lru_cache(maxsize=128)
def _encode_prompt_text(self, prompt: str):
- cached_ids = self._prompt_encode_cache.get(prompt)
- if cached_ids is not None:
- self._prompt_encode_cache.move_to_end(prompt)
- return list(cached_ids)
-
origin_ids = self.tokenizer.encode(prompt)
- self._prompt_encode_cache[prompt] = tuple(origin_ids)
- if len(self._prompt_encode_cache) > self._prompt_encode_cache_capacity:
- self._prompt_encode_cache.popitem(last=False)
return origin_ids
def _caclu_audio_token_num(self, input_audio_len: int):
@@ -90,6 +82,7 @@ def _caclu_audio_token_num(self, input_audio_len: int):
def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
origin_ids = self._encode_prompt_text(prompt)
+ origin_ids = copy.deepcopy(origin_ids)
#
->
origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)]
From 3b0e61353c5eb5017c57fa37c49910e868b8b39e Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 05:19:13 +0000
Subject: [PATCH 39/51] move load_audio_from_shm_payload onto AudioItem and read from shm by uuid
---
lightllm/server/multimodal_params.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index e45a28db12..6de86fd8b5 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -8,6 +8,7 @@
from io import BytesIO
from PIL import Image
from fastapi import Request
+from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
from lightllm.utils.multimodal_utils import fetch_resource
from lightllm.utils.log_utils import init_logger
@@ -15,13 +16,6 @@
logger = init_logger(__name__)
-def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray:
- num_samples = int(extra_params.get("audio_num_samples", 0))
- if num_samples > 0:
- return np.frombuffer(audio_data, dtype=np.float32, count=num_samples)
- return np.frombuffer(audio_data, dtype=np.float32)
-
-
class AudioItem:
def __init__(self, **kwargs):
self._type = kwargs["type"]
@@ -97,6 +91,12 @@ def to_origin_dict(self):
ret["data"] = self._data
return ret
+ def load_audio_from_shm_payload(self) -> np.ndarray:
+ audio_data = read_shm(get_shm_name_data(self.uuid))
+ audio_array = np.frombuffer(audio_data, dtype=np.float32)
+ assert audio_array.shape[0] == self.audio_length
+ return audio_array
+
class ImageItem:
def __init__(self, **kwargs):
From 56af31d4a1354ef29e434355471540da2a95dc5d Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 05:21:17 +0000
Subject: [PATCH 40/51] log an error before asserting on shm audio length mismatch
---
lightllm/server/multimodal_params.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
index 6de86fd8b5..6210628751 100644
--- a/lightllm/server/multimodal_params.py
+++ b/lightllm/server/multimodal_params.py
@@ -94,7 +94,9 @@ def to_origin_dict(self):
def load_audio_from_shm_payload(self) -> np.ndarray:
audio_data = read_shm(get_shm_name_data(self.uuid))
audio_array = np.frombuffer(audio_data, dtype=np.float32)
- assert audio_array.shape[0] == self.audio_length
+ if audio_array.shape[0] != self.audio_length:
+ logger.error(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}")
+ assert audio_array.shape[0] == self.audio_length
return audio_array
From 4a61198fabbd1e2e116905e5a1333f0b4b9e13ba Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 05:26:15 +0000
Subject: [PATCH 41/51] qwen3_omni_audio: use AudioItem.load_audio_from_shm_payload for shm reads
---
lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 71fdb3f3b1..03c57126ff 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -9,8 +9,7 @@
from typing import Callable, Optional, Union, List
from transformers.activations import ACT2FN
-from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
+from lightllm.server.multimodal_params import AudioItem
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
@@ -336,8 +335,8 @@ def encode(self, audio_items: List[AudioItem]):
if isinstance(item, AudioItem):
uuids.append(item.uuid)
items.append(item)
- audio_data = read_shm(get_shm_name_data(item.uuid))
- audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate)
+ assert self.processor.sampling_rate == 16000
+ audio = item.load_audio_from_shm_payload()
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
From ccd4b573e70b8fbbe3af0afffb3cf67caa4c66c1 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 05:30:02 +0000
Subject: [PATCH 42/51] whisper_audio: use AudioItem.load_audio_from_shm_payload for shm reads
---
lightllm/models/whisper/whisper_audio.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index 4cd9619e55..aaa29e1c71 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -6,8 +6,7 @@
from typing import List, Union
from safetensors.torch import load_file
from transformers.processing_utils import ProcessorMixin
-from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
-from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload
+from lightllm.server.multimodal_params import AudioItem
# tokenizer_class removed
@@ -37,7 +36,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
def get_T_after_cnn(self, L_in, dilation=1):
- for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "):
+ for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
L_out = 1 + L_out // stride
L_in = L_out
@@ -168,8 +167,7 @@ def encode(self, audio_items: List[AudioItem]):
if isinstance(item, AudioItem):
uuids.append(item.uuid)
items.append(item)
- audio_data = read_shm(get_shm_name_data(item.uuid))
- audio = load_audio_from_shm_payload(audio_data, item.extra_params, 16000)
+ audio = item.load_audio_from_shm_payload()
else:
raise ValueError(f"cannot read audio which type is {type(item)}!")
@@ -217,7 +215,9 @@ def encode(self, audio_items: List[AudioItem]):
ans_embeds = []
for i in range(len(uuids)):
+
item = items[i]
+
# 拼接该 audio 的所有 chunk embedding
cur_embed = torch.cat(per_audio_embeds[i], dim=0)
ans_embeds.append(cur_embed)
From b7d11876a659fb0e12c5886f8db38c4229f50b76 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 05:49:05 +0000
Subject: [PATCH 43/51] httpserver manager: remove verbose per-stage debug/info logging
---
lightllm/server/httpserver/manager.py | 27 +++------------------------
1 file changed, 3 insertions(+), 24 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 115be4bd38..07d5936890 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -618,30 +618,16 @@ async def transfer_to_next_module(
return
if not self.args.disable_audio:
- logger.debug(
- f"lightllm_req_id:{group_req_objs.group_req_id} "
- f"stage:transfer_to_audio "
- f"target_port:{self.args.audio_port}"
- )
self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
return
if self.args.enable_cpu_cache:
- logger.debug(
- f"lightllm_req_id:{group_req_objs.group_req_id} "
- f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}"
- )
self.send_to_multi_level_kv_cache.send_pyobj(
group_req_objs.to_group_req_index(),
protocol=pickle.HIGHEST_PROTOCOL,
)
return
- logger.debug(
- f"lightllm_req_id:{group_req_objs.group_req_id} "
- f"stage:transfer_to_router "
- f"target_port:{self.args.router_port}"
- )
self.send_to_router.send_pyobj(
group_req_objs.to_group_req_index(),
protocol=pickle.HIGHEST_PROTOCOL,
@@ -650,11 +636,6 @@ async def transfer_to_next_module(
if self.pd_mode.is_D():
# 在 D 模式下,不需要传输真的多模态参数,因为其已经被 P 处理好了
- logger.debug(
- f"lightllm_req_id:{group_req_objs.group_req_id} "
- f"stage:transfer_to_router_from_decode "
- f"target_port:{self.args.router_port}"
- )
self.send_to_router.send_pyobj(
group_req_objs.to_group_req_index(),
protocol=pickle.HIGHEST_PROTOCOL,
@@ -673,6 +654,7 @@ async def _wait_to_token_package(
req_status: "ReqStatus",
request: Request,
):
+
event = req_status.event
unfinished_count = sampling_params.best_of
out_token_counter = 0
@@ -715,11 +697,6 @@ async def _wait_to_token_package(
first_token_cost_ms = (time.time() - start_time) * 1000
is_first_token = False
self.first_time_costs.add(first_token_cost_ms)
- logger.info(
- f"lightllm_req_id:{group_request_id} "
- f"stage:first_token_arrived elapsed_ms:{first_token_cost_ms:.3f} "
- f"sub_req_id:{sub_req_id} prompt_tokens:{prompt_tokens}"
- )
out_token_counter += 1
@@ -803,6 +780,7 @@ async def recycle_resource_loop(self):
pre_time_mark = time.time()
while True:
+
try:
await asyncio.wait_for(self.recycle_event.wait(), timeout=0.02)
except asyncio.TimeoutError:
@@ -879,6 +857,7 @@ async def handle_loop(self):
for _ in range(read_token_count):
if not req.out_tokens_queue.is_empty():
+
text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
req.cumlogprob += float(req.shm_logprobs.arr[src_index])
metadata = {
From 40cd0b9882160db09723fe5357832f42908af619 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 06:08:28 +0000
Subject: [PATCH 44/51] generate: drop request.state.lightllm_req_id assignment
---
lightllm/server/httpserver/manager.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 07d5936890..8b7dafeffe 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -301,8 +301,6 @@ async def generate(
start_time = time.time()
request_headers = request.headers if request is not None else {}
group_request_id = self.alloc_req_id(sampling_params, is_health_req)
- if request is not None:
- request.state.lightllm_req_id = group_request_id
audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0
image_count = len(multimodal_params.images) if multimodal_params is not None else 0
self._log_stage_timing(
From 284815fb33022f6c5b6fda5679c8e4508dd70c66 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 06:09:38 +0000
Subject: [PATCH 45/51] stage-timing "received": log audio_count/image_count instead of booleans
---
lightllm/server/httpserver/manager.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8b7dafeffe..45193e928b 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -307,8 +307,8 @@ async def generate(
group_request_id,
start_time,
"received",
- has_audio=audio_count > 0,
- has_image=image_count > 0,
+ audio_count=audio_count,
+ image_count=image_count,
)
try:
From fa11c53cde2dc7d60503c77141718b3d871a1c40 Mon Sep 17 00:00:00 2001
From: wangzaijun
Date: Wed, 8 Apr 2026 06:22:24 +0000
Subject: [PATCH 46/51] trim stage-timing kwargs and remove start_time parameter from _encode
---
lightllm/server/httpserver/manager.py | 31 +++------------------------
1 file changed, 3 insertions(+), 28 deletions(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 45193e928b..610931784c 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -322,8 +322,6 @@ async def generate(
group_request_id,
start_time,
"verify_and_preload_done",
- audio_count=audio_count,
- image_count=image_count,
)
# 记录请求到达的相关信息
@@ -334,9 +332,6 @@ async def generate(
group_request_id,
start_time,
"encode_done",
- prompt_tokens=len(prompt_ids),
- audio_count=audio_count,
- image_count=image_count,
)
prompt_tokens = len(prompt_ids)
@@ -350,8 +345,6 @@ async def generate(
group_request_id,
start_time,
"check_and_repair_length_done",
- prompt_tokens=len(prompt_ids),
- max_new_tokens=sampling_params.max_new_tokens,
)
if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP():
@@ -404,7 +397,6 @@ async def generate(
group_request_id,
start_time,
"shm_req_init_done",
- req_count=len(req_objs),
)
logger.debug(
@@ -423,8 +415,6 @@ async def generate(
group_request_id,
start_time,
"request_forwarded",
- has_audio=audio_count > 0,
- has_image=image_count > 0,
)
results_generator = self._wait_to_token_package(
@@ -481,6 +471,7 @@ def _count_multimodal_tokens(self, multimodal_params: MultimodalParams) -> Tuple
return image_tokens, audio_tokens
async def _log_req_header(self, request_headers, group_request_id: int):
+
x_request_id = request_headers.get("X-Request-Id", "")
x_session_id = request_headers.get("X-Session-Id", "")
@@ -493,11 +484,7 @@ async def _log_req_header(self, request_headers, group_request_id: int):
return
async def _encode(
- self,
- prompt: Union[str, List[int]],
- multimodal_params: MultimodalParams,
- sampling_params: SamplingParams,
- start_time: Optional[float] = None,
+ self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams
):
if isinstance(prompt, str):
if self.enable_multimodal:
@@ -507,14 +494,6 @@ async def _encode(
if multimodal_params.audios:
assert not self.args.disable_audio, "audio multimodal not enabled"
await self._alloc_multimodal_resources(multimodal_params, sampling_params)
- log_req_id = getattr(sampling_params, "group_request_id", None)
- self._log_stage_timing(
- log_req_id,
- start_time,
- "alloc_multimodal_resources_done",
- audio_count=len(multimodal_params.audios),
- image_count=len(multimodal_params.images),
- )
prompt_ids = self.tokenizer.encode(
prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
)
@@ -605,13 +584,9 @@ async def transfer_to_next_module(
self,
group_req_objs: Optional[GroupReqObjs] = None,
):
+
if self.pd_mode.is_P_or_NORMAL():
if not self.args.disable_vision:
- logger.debug(
- f"lightllm_req_id:{group_req_objs.group_req_id} "
- f"stage:transfer_to_visual "
- f"target_port:{self.args.visual_port}"
- )
self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL)
return
From 44c63d97eb26ac5ef3e946fa6e98acc15ae4fa14 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Wed, 8 Apr 2026 07:34:56 +0000
Subject: [PATCH 47/51] fix: drop removed start_time kwarg from _encode call in generate
---
lightllm/server/httpserver/manager.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 610931784c..c9822ff618 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -327,7 +327,7 @@ async def generate(
# 记录请求到达的相关信息
await self._log_req_header(request_headers, group_request_id)
# encode
- prompt_ids = await self._encode(prompt, multimodal_params, sampling_params, start_time=start_time)
+ prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
self._log_stage_timing(
group_request_id,
start_time,
From c5cc9952105dd955b04f1ced509ce1294b7227c3 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Wed, 8 Apr 2026 09:09:12 +0000
Subject: [PATCH 48/51] support long audio
---
lightllm/models/qwen3_omni_moe_thinker/audio_process.py | 2 +-
lightllm/models/qwen3_omni_moe_thinker/model.py | 7 +------
2 files changed, 2 insertions(+), 7 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
index 194914d455..58b223d579 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py
@@ -102,7 +102,7 @@ def zero_mean_unit_var_norm(
def _preprocess(
self,
raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
- truncation: bool = True,
+ truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = None,
diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py
index 1b8fa0110d..bee15e3d2a 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/model.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/model.py
@@ -59,12 +59,7 @@ def init_audioitem_extral_params(
return
def get_audio_token_length(self, audio: AudioItem):
- # 这里得处理对应奖语音长度按照 30 进行限制,后续处理中,超过30的会被截断。
- if audio.audio_length > self.n_samples:
- logger.warning(f"audio length {audio.audio_length} exceed max length {self.n_samples}, will be truncated.")
-
- length = min(audio.audio_length, int(self.n_samples))
- token_num = self._caclu_audio_token_num(length)
+ token_num = self._caclu_audio_token_num(audio.audio_length)
# print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}")
return token_num
From eb4558a906da5c4c8aa9ab9f8e90d18382f70cc1 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 9 Apr 2026 04:45:20 +0000
Subject: [PATCH 49/51] add check_long_audio_infer
---
.../qwen3_omni_audio.py | 23 +++++++++++++++++++
lightllm/models/whisper/whisper_audio.py | 3 +++
.../audioserver/model_infer/model_rpc.py | 1 +
3 files changed, 27 insertions(+)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 03c57126ff..9fb4e1d1db 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -10,10 +10,13 @@
from transformers.activations import ACT2FN
from lightllm.server.multimodal_params import AudioItem
+from lightllm.utils.log_utils import init_logger
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
+logger = init_logger(__name__)
+
def _get_feat_extract_output_lengths(input_lengths):
"""
@@ -259,6 +262,7 @@ def load_model(self, weight_dir, config):
self.load_state_dict(weight_dict)
+ @torch.inference_mode()
def forward(
self,
input_features,
@@ -327,6 +331,7 @@ def forward(
hidden_states = self.proj2(hidden_states)
return hidden_states
+ @torch.inference_mode()
def encode(self, audio_items: List[AudioItem]):
uuids = []
items: List[AudioItem] = []
@@ -363,3 +368,21 @@ def encode(self, audio_items: List[AudioItem]):
all_embeds.append(cur_embed)
return all_embeds, audio_items
+
+ @torch.inference_mode()
+ def check_long_audio_infer(self):
+ """Exercise forward with mel length chosen so the conv loop runs once with batch dim == conv_chunksize."""
+ device = next(self.parameters()).device
+ frame_len = self.conv_chunksize * (self.n_window * 2)
+ logger.info(
+ "check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s",
+ frame_len,
+ self.conv_chunksize,
+ self.n_window,
+ device,
+ self.data_type,
+ )
+ input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=self.data_type)
+ feature_lens = torch.tensor([frame_len], device=device, dtype=torch.long)
+ out = self.forward(input_features, feature_lens=feature_lens)
+ logger.info("check_long_audio_infer: done output_shape=%s", tuple(out.shape))
diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py
index aaa29e1c71..8a984d29a5 100644
--- a/lightllm/models/whisper/whisper_audio.py
+++ b/lightllm/models/whisper/whisper_audio.py
@@ -223,3 +223,6 @@ def encode(self, audio_items: List[AudioItem]):
ans_embeds.append(cur_embed)
return ans_embeds, audio_items
+
+ def check_long_audio_infer(self):
+ pass
diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index 39a7e06ac3..82919856d9 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -51,6 +51,7 @@ def exposed_init_model(self, kvargs):
self.model.load_model(weight_dir, model_cfg)
self.model = self.model.cuda()
+ self.model.check_long_audio_infer()
self.cache_client = rpyc.connect("localhost", self.cache_port, config={"allow_pickle": True})
self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
From 94ffee1fab412071a93dfc69abaeeb3d0fd43356 Mon Sep 17 00:00:00 2001
From: WANDY666 <1060304770@qq.com>
Date: Thu, 9 Apr 2026 07:05:03 +0000
Subject: [PATCH 50/51] add LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE
---
lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index 9fb4e1d1db..c81e1d5859 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -15,6 +15,8 @@
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor
+QWEN3_OMNI_CONV_CHUNKSIZE = int(os.getenv("LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE", 500))
+
logger = init_logger(__name__)
@@ -159,7 +161,7 @@ def __init__(
activation_function="gelu",
output_dim=2048,
n_window_infer=800,
- conv_chunksize=500,
+ conv_chunksize=QWEN3_OMNI_CONV_CHUNKSIZE,
encoder_attention_heads=20,
attention_dropout=0,
activation_dropout=0,
From 0553276514fff3eef059306b917c8a9f6084dced Mon Sep 17 00:00:00 2001
From: wanzihao <1060304770@qq.com>
Date: Thu, 9 Apr 2026 15:14:34 +0800
Subject: [PATCH 51/51] Apply suggestions from code review. Use params.dtype
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
.../models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
index c81e1d5859..ff49ab160a 100644
--- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
+++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
@@ -374,7 +374,9 @@ def encode(self, audio_items: List[AudioItem]):
@torch.inference_mode()
def check_long_audio_infer(self):
"""Exercise forward with mel length chosen so the conv loop runs once with batch dim == conv_chunksize."""
- device = next(self.parameters()).device
+ params = next(self.parameters())
+ device = params.device
+ dtype = params.dtype
frame_len = self.conv_chunksize * (self.n_window * 2)
logger.info(
"check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s",
@@ -382,9 +384,9 @@ def check_long_audio_infer(self):
self.conv_chunksize,
self.n_window,
device,
- self.data_type,
+ dtype,
)
- input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=self.data_type)
+ input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=dtype)
feature_lens = torch.tensor([frame_len], device=device, dtype=torch.long)
out = self.forward(input_features, feature_lens=feature_lens)
logger.info("check_long_audio_infer: done output_shape=%s", tuple(out.shape))