From c8b388845b80335e532f930ff09e8be41d050eaa Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 26 Mar 2026 08:17:31 +0000 Subject: [PATCH 01/51] qwen3_vl_moe support prefill_cudagraph --- .../layer_infer/transformer_layer_infer.py | 39 ++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py index 391ee8bf6b..40d4bbc0ad 100644 --- a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py +++ b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py @@ -1,12 +1,14 @@ import torch import torch.distributed as dist from typing import Tuple +from lightllm.common.basemodel.infer_struct import InferStateInfo from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton_fused from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight from lightllm.models.qwen3_vl.infer_struct import Qwen3VLInferStateInfo from lightllm.distributed import all_reduce from lightllm.models.qwen3_vl.triton_kernel.deepstack_multimodal_emb import apply_deepstack_features +from lightllm.utils.tensor_utils import tensor_to_no_ref_tensor class Qwen3VLMOETransformerLayerInfer(Qwen3MOETransformerLayerInfer): @@ -48,7 +50,7 @@ def context_forward(self, input_embdings, infer_state: Qwen3VLInferStateInfo, la q, cache_kv = self._get_qkv(input1, infer_state, layer_weight) input1 = None self._post_cache_kv(cache_kv, infer_state, layer_weight) - o = self._context_attention_kernel(q, cache_kv, infer_state, layer_weight) + o = self._context_attention_wrapper_run(q, cache_kv, infer_state, layer_weight) q = None o = self._get_o(o, infer_state, layer_weight) if self.tp_world_size_ > 1: @@ -62,9 +64,42 @@ def context_forward(self, input_embdings, infer_state: Qwen3VLInferStateInfo, la if self.tp_world_size_ > 1: all_reduce(ffn_out, op=dist.ReduceOp.SUM, group=infer_state.dist_group, async_op=False) input_embdings.add_(ffn_out.view(-1, self.embed_dim_)) - apply_deepstack_features( + self._apply_deepstack_features_wrapper_run( input_embeddings=input_embdings, infer_state=infer_state, layer_num=self.layer_num_, ) return input_embdings + + def _apply_deepstack_features_wrapper_run( + self, + input_embeddings: torch.Tensor, + infer_state: InferStateInfo, + layer_num: int, + ): + if torch.cuda.is_current_stream_capturing(): + input_embeddings = input_embeddings.contiguous() + _input_embeddings = tensor_to_no_ref_tensor(input_embeddings) + pre_capture_graph = infer_state.prefill_cuda_graph_get_current_capture_graph() + pre_capture_graph.__exit__(None, None, None) + + infer_state.prefill_cuda_graph_create_graph_obj() + infer_state.prefill_cuda_graph_get_current_capture_graph().__enter__() + + def apply_func(new_infer_state: InferStateInfo): + apply_deepstack_features( + input_embeddings=_input_embeddings, + infer_state=new_infer_state, + layer_num=layer_num, + ) + return + + infer_state.prefill_cuda_graph_add_cpu_runnning_func(func=apply_func, after_graph=pre_capture_graph) + else: + apply_deepstack_features( + input_embeddings=input_embeddings, + infer_state=infer_state, + layer_num=layer_num, + ) + + return From e7fba3af30bb723dcf9909a6d06bbb9ff514134b Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Fri, 27 Mar 2026 05:17:57 +0000 Subject: [PATCH 02/51] add audio dp --- lightllm/server/api_cli.py | 4 ++ lightllm/server/api_start.py | 21 +++++- lightllm/server/audioserver/manager.py | 41 ++++++----- .../audioserver/model_infer/model_rpc.py | 68 ++++++++++++++----- lightllm/server/core/objs/start_args_type.py | 6 +- 5 files changed, 105 insertions(+), 35 deletions(-) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index d32da8097c..776fbc8247 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -426,6 +426,9 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch" ) + parser.add_argument( + "--audio_infer_batch_size", type=int, default=None, help="number of audios to process in each inference batch" + ) parser.add_argument( "--visual_send_batch_size", type=int, @@ -440,6 +443,7 @@ def make_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT") parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT") + parser.add_argument("--audio_dp", type=int, default=1, help="number of data parallel instances for audio encoder") parser.add_argument( "--visual_nccl_ports", nargs="+", diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 364f9ca281..180b16e658 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -188,6 +188,9 @@ def normal_or_p_d_start(args): if args.visual_dp <= 0: raise ValueError("visual_dp must be a positive integer.") + if args.audio_dp <= 0: + raise ValueError("audio_dp must be a positive integer.") + if args.visual_infer_batch_size is None: args.visual_infer_batch_size = args.visual_dp @@ -198,6 +201,15 @@ def normal_or_p_d_start(args): f"a positive integer multiple of visual_dp ({args.visual_dp})" ) + if args.audio_infer_batch_size is None: + args.audio_infer_batch_size = args.audio_dp * 2 + + if args.audio_infer_batch_size // args.audio_dp < 1 or args.audio_infer_batch_size % args.audio_dp != 0: + raise ValueError( + f"audio_infer_batch_size ({args.audio_infer_batch_size}) must be " + f"a positive integer multiple of audio_dp ({args.audio_dp})" + ) + if args.disable_chunked_prefill: args.chunked_prefill_size = args.max_req_total_len # 普通模式下 @@ -247,8 +259,10 @@ def normal_or_p_d_start(args): ports_locker.lock_port() node_world_size = args.tp // args.nnodes + audio_model_dp_ports_num = 0 if args.disable_audio else args.audio_dp can_use_ports = alloc_can_use_network_port( - num=10 + node_world_size + args.visual_dp * (args.visual_tp + 1), used_ports=already_uesd_ports + num=10 + node_world_size + args.visual_dp * (args.visual_tp + 1) + audio_model_dp_ports_num, + used_ports=already_uesd_ports, ) logger.info(f"alloced ports: {can_use_ports}") ( @@ -274,6 +288,9 @@ def normal_or_p_d_start(args): visual_nccl_ports.append(can_use_ports[0]) can_use_ports = can_use_ports[1:] + audio_model_dp_ports = can_use_ports[0:audio_model_dp_ports_num] + can_use_ports = can_use_ports[audio_model_dp_ports_num:] + # 将申请好的端口放入args参数中 if args.nccl_port is None: args.nccl_port = nccl_port @@ -342,7 +359,7 @@ def normal_or_p_d_start(args): start_audio_process, ], start_args=[ - (args,), + (args, audio_model_dp_ports), ], ) diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index bb0a745302..f7cb300aaf 100644 --- a/lightllm/server/audioserver/manager.py +++ b/lightllm/server/audioserver/manager.py @@ -26,7 +26,7 @@ class AudioManager: def __init__( self, args: StartArgs, - infer_batch_size=4, + audio_model_rpc_ports, ): context = zmq.asyncio.Context(2) @@ -45,29 +45,32 @@ def __init__( self.waiting_reqs: List[GroupReqIndexes] = [] self.model_weightdir = args.model_dir self.tp_world_size = args.tp - self.world_size = 1 - self.infer_batch_size = infer_batch_size + self.audio_dp = args.audio_dp + self.infer_batch_size = args.audio_infer_batch_size self.trust_remote_code = args.trust_remote_code self.args = args + self.audio_model_rpc_ports = audio_model_rpc_ports or [None] * self.audio_dp self.shm_req_manager = ShmReqManager() + self.model_rpcs: List[AudioModelRpcClient] = [] async def wait_to_model_ready(self): - - self.model_rpcs: List[AudioModelRpcClient] = [] - for rank_id in range(self.world_size): - rpc_model = await start_model_process(world_size=self.world_size) + self.model_rpcs = [] + for dp_rank_id in range(self.audio_dp): + rpc_model = await start_model_process( + world_size=self.audio_dp, port=self.audio_model_rpc_ports[dp_rank_id], device_id=dp_rank_id + ) self.model_rpcs.append(rpc_model) init_model_ret = [] - for rank_id in range(self.world_size): + for dp_rank_id in range(self.audio_dp): kvargs = { "weight_dir": self.model_weightdir, "trust_remote_code": self.trust_remote_code, - "rank_id": rank_id, + "dp_rank_id": dp_rank_id, "cache_port": self.cache_port, "data_type": self.args.data_type, } - init_model_ret.append(self.model_rpcs[rank_id].init_model(kvargs)) + init_model_ret.append(self.model_rpcs[dp_rank_id].init_model(kvargs)) await asyncio.gather(*init_model_ret) return @@ -75,7 +78,11 @@ async def infer_audios(self, audios: List[AudioItem]): if len(audios) == 0: return - rets = [self.model_rpcs[tp_rank].encode(audios) for tp_rank in range(self.world_size)] + rets = [] + for dp_rank_id in range(self.audio_dp): + assigned_audios = [audios[i] for i in range(dp_rank_id, len(audios), self.audio_dp)] + if assigned_audios: + rets.append(self.model_rpcs[dp_rank_id].encode(assigned_audios)) await asyncio.gather(*rets) return @@ -148,19 +155,21 @@ async def loop_for_netio_req(self): def clean_up(self): for model_rpc in self.model_rpcs: - model_rpc.rpc_server_process.kill() + if model_rpc.rpc_server_process is not None: + model_rpc.rpc_server_process.kill() for model_rpc in self.model_rpcs: - model_rpc.rpc_server_process.join() + if model_rpc.rpc_server_process is not None: + model_rpc.rpc_server_process.join() return -def start_audio_process(args, pipe_writer): +def start_audio_process(args, model_rpc_ports, pipe_writer): # 注册graceful 退出的处理 graceful_registry(inspect.currentframe().f_code.co_name) setproctitle.setproctitle(f"lightllm::{get_unique_server_name()}::audio_server") + audioserver = AudioManager(args=args, audio_model_rpc_ports=model_rpc_ports) try: - audioserver = AudioManager(args=args) asyncio.run(audioserver.wait_to_model_ready()) except Exception as e: logger.exception(str(e)) @@ -170,7 +179,7 @@ def start_audio_process(args, pipe_writer): pipe_writer.send("init ok") def handle_exception(loop, context): - logger.exception(f"VisualServer Caught exception: {str(context)}") + logger.exception(f"AudioServer Caught exception: {str(context)}") loop = asyncio.new_event_loop() loop.set_exception_handler(handle_exception) diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py index a8a2c39c3e..cbd39666a0 100644 --- a/lightllm/server/audioserver/model_infer/model_rpc.py +++ b/lightllm/server/audioserver/model_infer/model_rpc.py @@ -1,22 +1,25 @@ import asyncio import rpyc +import socket import torch -from typing import Dict, List, Tuple +import inspect +from typing import List +from rpyc.utils.classic import obtain +from rpyc.utils.server import ThreadedServer from transformers.configuration_utils import PretrainedConfig from lightllm.models.whisper.whisper_audio import WhisperAudioModel from lightllm.models.qwen3_omni_moe_thinker.qwen3_omni_audio import Qwen3OmniMoeAudioEncoder from lightllm.server.multimodal_params import AudioItem from lightllm.utils.infer_utils import set_random_seed from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient +from lightllm.utils.graceful_utils import graceful_registry class AudioModelRpcServer(rpyc.Service): def exposed_init_model(self, kvargs): - # 注册graceful 退出的处理 - from lightllm.utils.graceful_utils import graceful_registry - import inspect - - graceful_registry(inspect.currentframe().f_code.co_name) + kvargs = obtain(kvargs) + self.dp_rank_id = kvargs["dp_rank_id"] + torch.cuda.set_device(self.dp_rank_id) weight_dir = kvargs["weight_dir"] model_cfg, _ = PretrainedConfig.get_config_dict(weight_dir) @@ -41,7 +44,7 @@ def exposed_init_model(self, kvargs): # CpuEmbedCacheClient 的初始化需要依赖这个设置的环境信息。 from lightllm.utils.dist_utils import set_current_device_id - set_current_device_id(torch.cuda.current_device()) + set_current_device_id(self.dp_rank_id) self.cpu_embed_cache_client = CpuEmbedCacheClient( create_meta_data=False, @@ -65,6 +68,8 @@ def forward(self, audios): # @calculate_time(show=False, min_cost_ms=300) def exposed_encode(self, audios): + torch.cuda.set_device(self.dp_rank_id) + audios = obtain(audios) return self.forward(audios) @@ -74,6 +79,7 @@ def __init__(self, model_rpc, world_size, rpc_server_process=None): self.world_size = world_size self.rpc_server_process = rpc_server_process self.use_rpc = self.world_size != 1 + if self.use_rpc: def async_wrap(f): @@ -82,7 +88,6 @@ def async_wrap(f): async def _func(*args, **kwargs): ans = f(*args, **kwargs) await asyncio.to_thread(ans.wait) - # raise if exception return ans.value return _func @@ -95,21 +100,52 @@ async def _func(*args, **kwargs): return async def init_model(self, kvargs): - ans: rpyc.AsyncResult = self._init_model(kvargs) + ans = self._init_model(kvargs) if self.use_rpc: - await ans - return - else: - return + return await ans + return ans async def encode(self, audios: List[AudioItem]): ans = self._encode(audios) if self.use_rpc: return await ans - else: - return ans + return ans + +def _init_env(port, device_id): + graceful_registry(inspect.currentframe().f_code.co_name) + torch.cuda.set_device(device_id) -async def start_model_process(world_size): + from lightllm.utils.dist_utils import set_current_device_id + import lightllm.utils.rpyc_fix_utils as _ + + set_current_device_id(device_id) + t = ThreadedServer(AudioModelRpcServer(), port=port, protocol_config={"allow_pickle": True}) + t.start() + return + + +async def start_model_process(world_size, port=None, device_id=None): if world_size == 1: return AudioModelRpcClient(AudioModelRpcServer(), world_size) + + import multiprocessing + + proc = multiprocessing.Process(target=_init_env, args=(port, device_id)) + proc.start() + await asyncio.sleep(2) + repeat_count = 0 + while repeat_count < 20: + try: + con = rpyc.connect("localhost", port, config={"allow_pickle": True}) + con._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + break + except BaseException: + await asyncio.sleep(1) + repeat_count += 1 + + if repeat_count == 20: + raise Exception("init rpc env error!") + + assert proc.is_alive() + return AudioModelRpcClient(con.root, world_size, rpc_server_process=proc) diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index 37c022f3a3..8411a14e3c 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -99,10 +99,12 @@ class StartArgs: grouping_key: List[str] = field(default_factory=list) push_interval: int = field(default=10) visual_infer_batch_size: int = field(default=None) + audio_infer_batch_size: int = field(default=None) visual_send_batch_size: int = field(default=1) visual_gpu_ids: List[int] = field(default_factory=lambda: [0]) visual_tp: int = field(default=1) visual_dp: int = field(default=1) + audio_dp: int = field(default=1) visual_nccl_ports: List[int] = field(default=None) enable_monitor_auth: bool = field(default=False) disable_cudagraph: bool = field(default=False) @@ -125,7 +127,9 @@ class StartArgs: vit_att_backend: List[str] = field( default=("auto",), metadata={"choices": ["auto", "triton", "fa3", "sdpa", "xformers"]} ) - llm_kv_type: str = field(default="None", metadata={"choices": ["None", "int8kv", "int4kv", "fp8kv_sph", "fp8kv_spt"]}) + llm_kv_type: str = field( + default="None", metadata={"choices": ["None", "int8kv", "int4kv", "fp8kv_sph", "fp8kv_spt"]} + ) llm_kv_quant_group_size: int = field(default=8) sampling_backend: str = field(default="triton", metadata={"choices": ["triton", "sglang_kernel"]}) penalty_counter_mode: str = field( From 671b5aa446b970c575c0f02ebb36d60f091e9ba8 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Fri, 27 Mar 2026 14:34:57 +0000 Subject: [PATCH 03/51] Add startup warmups for HTTP audio preload and per-rank audio workers to remove first-request audio cold- start latency. --- .../qwen3_omni_audio.py | 19 ++++ lightllm/models/whisper/whisper_audio.py | 18 ++++ lightllm/server/api_http.py | 7 +- lightllm/server/audioserver/manager.py | 101 +++++++++++++++++- .../audioserver/model_infer/model_rpc.py | 38 ++++++- lightllm/server/httpserver/manager.py | 83 ++++++++++++++ lightllm/server/multimodal_params.py | 35 ++++++ 7 files changed, 297 insertions(+), 4 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 3573ecde86..6c620448b9 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -389,3 +389,22 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC if ids_to_set: self.cache_client.root.set_items_embed(ids=ids_to_set) torch.cuda.current_stream().synchronize() + + @torch.no_grad() + def warmup(self, audio_bytes: bytes): + audio = BytesIO(audio_bytes) + audio, _ = librosa.load(audio, sr=self.processor.sampling_rate) + input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) + if feature_attention_mask is not None: + audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) + input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0) + else: + audio_feature_lengths = None + + feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) + _ = self.forward( + input_features, + feature_lens=feature_lens, + ) + torch.cuda.current_stream().synchronize() + return diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index 36c9408cb8..0493afdb9a 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -241,3 +241,21 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC if ids_to_set: self.cache_client.root.set_items_embed(ids=ids_to_set) torch.cuda.current_stream().synchronize() + + @torch.no_grad() + def warmup(self, audio_bytes: bytes): + audio = BytesIO(audio_bytes) + audio, _ = librosa.load(audio, sr=16000) + + from .defaults import MIN_AUDIO_LEN + + if audio.shape[0] < MIN_AUDIO_LEN: + audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0) + + batch_audio_lens = np.array([min(audio.shape[0], self.max_length)], dtype=np.int32) + audios, audio_lens_after_cnn = self.audio_processor( + [audio], batch_audio_lens, sampling_rate=16000, return_tensors="pt" + ) + _ = self.forward(audios, audio_lens_after_cnn) + torch.cuda.current_stream().synchronize() + return diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py index 230da5b369..6be738befc 100755 --- a/lightllm/server/api_http.py +++ b/lightllm/server/api_http.py @@ -40,7 +40,7 @@ from fastapi.responses import Response, StreamingResponse, JSONResponse from lightllm.server.core.objs.sampling_params import SamplingParams from lightllm.server.core.objs import StartArgs -from .multimodal_params import MultimodalParams +from .multimodal_params import MultimodalParams, warmup_audio_preload from .httpserver.manager import HttpServerManager from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster from .api_lightllm import lightllm_get_score @@ -359,6 +359,11 @@ async def startup_event(): logger.info("server start up") loop = asyncio.get_event_loop() g_objs.set_args(get_env_start_args()) + if g_objs.args.enable_multimodal and not g_objs.args.disable_audio: + warmup_start = time.time() + logger.info("http_audio_preload_warmup_start") + await warmup_audio_preload() + logger.info(f"http_audio_preload_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}") loop.create_task(g_objs.httpserver_manager.handle_loop()) logger.info(f"server start up ok, loop use is {asyncio.get_event_loop()}") return diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index f7cb300aaf..b4fb002965 100644 --- a/lightllm/server/audioserver/manager.py +++ b/lightllm/server/audioserver/manager.py @@ -7,7 +7,8 @@ import socket import inspect import setproctitle -from typing import List +import time +from typing import Dict, List asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) from lightllm.utils.log_utils import init_logger @@ -52,6 +53,27 @@ def __init__( self.audio_model_rpc_ports = audio_model_rpc_ports or [None] * self.audio_dp self.shm_req_manager = ShmReqManager() self.model_rpcs: List[AudioModelRpcClient] = [] + self.req_stage_times: Dict[int, Dict[str, float]] = {} + self.next_module_port = args.multi_level_kv_cache_port if args.enable_cpu_cache else args.router_port + + def _mark_req_stage(self, req_id: int, stage: str): + now = time.time() + req_stage_dict = self.req_stage_times.setdefault(req_id, {}) + if "audio_recv" not in req_stage_dict: + req_stage_dict["audio_recv"] = now + req_stage_dict[stage] = now + return now - req_stage_dict["audio_recv"] + + def _log_req_stage(self, req_id: int, stage: str, **kwargs): + elapsed_s = self._mark_req_stage(req_id, stage) + extras = " ".join(f"{k}:{v}" for k, v in kwargs.items()) + suffix = f" {extras}" if extras else "" + logger.info(f"lightllm_req_id:{req_id} stage:{stage} elapsed_ms:{elapsed_s * 1000.0:.3f}{suffix}") + return + + def _cleanup_req_stage(self, req_id: int): + self.req_stage_times.pop(req_id, None) + return async def wait_to_model_ready(self): self.model_rpcs = [] @@ -72,18 +94,37 @@ async def wait_to_model_ready(self): } init_model_ret.append(self.model_rpcs[dp_rank_id].init_model(kvargs)) await asyncio.gather(*init_model_ret) + + warmup_start = time.time() + logger.info(f"audio_warmup_start audio_dp:{self.audio_dp}") + + async def warmup_one_rank(dp_rank_id: int): + rank_start = time.time() + logger.info(f"audio_warmup_rank_start dp_rank_id:{dp_rank_id}") + await self.model_rpcs[dp_rank_id].warmup_model() + logger.info( + f"audio_warmup_rank_done dp_rank_id:{dp_rank_id} elapsed_ms:{(time.time() - rank_start) * 1000.0:.3f}" + ) + + await asyncio.gather(*[warmup_one_rank(dp_rank_id) for dp_rank_id in range(self.audio_dp)]) + logger.info(f"audio_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}") return async def infer_audios(self, audios: List[AudioItem]): if len(audios) == 0: return + infer_start = time.time() rets = [] for dp_rank_id in range(self.audio_dp): assigned_audios = [audios[i] for i in range(dp_rank_id, len(audios), self.audio_dp)] if assigned_audios: rets.append(self.model_rpcs[dp_rank_id].encode(assigned_audios)) await asyncio.gather(*rets) + logger.info( + f"audio_infer_batch_done audio_count:{len(audios)} audio_dp:{self.audio_dp} " + f"elapsed_ms:{(time.time() - infer_start) * 1000.0:.3f}" + ) return @@ -96,6 +137,11 @@ async def loop_for_fwd(self): audios_need_infer = [] while len(self.waiting_reqs) > 0: group_req_indexes = self.waiting_reqs.pop(0) + self._log_req_stage( + group_req_indexes.group_req_id, + "audio_queue_picked", + waiting_queue_size=len(self.waiting_reqs), + ) shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0]) disable_prompt_cache = shm_req.sample_params.disable_prompt_cache is_aborted = shm_req.is_aborted @@ -105,6 +151,7 @@ async def loop_for_fwd(self): # 因为采用 shm 来映射所有的 req 对象以后,引用管理情况复杂了 # 需要一些一致的流程来保证不出现异步问题。 self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL) + self._cleanup_req_stage(group_req_indexes.group_req_id) continue multimodal_params = group_req_indexes.multimodal_params @@ -116,28 +163,74 @@ async def loop_for_fwd(self): else: ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids)) + current_req_has_pending_audio = False for audio, ready in zip(multimodal_params.audios, ready_audio): if not ready: audios_need_infer.append(audio) + current_req_has_pending_audio = True if len(audios_need_infer) == self.infer_batch_size: + batch_reqs = processing_group_reqs + ( + [group_req_indexes] if current_req_has_pending_audio else [] + ) + batch_req_ids = [req.group_req_id for req in batch_reqs] + logger.info( + f"audio_batch_ready req_ids:{batch_req_ids} " + f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}" + ) + for batch_req_id in batch_req_ids: + self._log_req_stage( + batch_req_id, "audio_infer_start", batch_audio_count=len(audios_need_infer) + ) await self.infer_audios(audios_need_infer) + for batch_req_id in batch_req_ids: + self._log_req_stage( + batch_req_id, "audio_infer_done", batch_audio_count=len(audios_need_infer) + ) audios_need_infer = [] for _group_req_indexes in processing_group_reqs: + self._log_req_stage( + _group_req_indexes.group_req_id, + "audio_send_to_next_module", + target_port=self.next_module_port, + ) self.send_to_next_module.send_pyobj( _group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL ) + self._cleanup_req_stage(_group_req_indexes.group_req_id) processing_group_reqs = [] if len(audios_need_infer) == 0: + self._log_req_stage( + group_req_indexes.group_req_id, + "audio_send_to_next_module", + target_port=self.next_module_port, + pending_audio_count=0, + ) self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL) + self._cleanup_req_stage(group_req_indexes.group_req_id) else: processing_group_reqs.append(group_req_indexes) if len(audios_need_infer) > 0: + batch_req_ids = [req.group_req_id for req in processing_group_reqs] + logger.info( + f"audio_batch_ready req_ids:{batch_req_ids} " + f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}" + ) + for batch_req_id in batch_req_ids: + self._log_req_stage(batch_req_id, "audio_infer_start", batch_audio_count=len(audios_need_infer)) await self.infer_audios(audios_need_infer) + for batch_req_id in batch_req_ids: + self._log_req_stage(batch_req_id, "audio_infer_done", batch_audio_count=len(audios_need_infer)) for _group_req_indexes in processing_group_reqs: + self._log_req_stage( + _group_req_indexes.group_req_id, + "audio_send_to_next_module", + target_port=self.next_module_port, + ) self.send_to_next_module.send_pyobj(_group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL) + self._cleanup_req_stage(_group_req_indexes.group_req_id) processing_group_reqs = [] audios_need_infer = [] @@ -149,6 +242,12 @@ async def loop_for_netio_req(self): f"audio recv req id {recv_req.group_req_id} " f"audio count {len(recv_req.multimodal_params.audios)}" ) + self._log_req_stage( + recv_req.group_req_id, + "audio_recv", + audio_count=len(recv_req.multimodal_params.audios), + waiting_queue_size=len(self.waiting_reqs), + ) self.waiting_reqs.append(recv_req) else: assert False, f"Error Req Inf {recv_req}" diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py index cbd39666a0..8db3be7f35 100644 --- a/lightllm/server/audioserver/model_infer/model_rpc.py +++ b/lightllm/server/audioserver/model_infer/model_rpc.py @@ -1,8 +1,12 @@ import asyncio -import rpyc +import inspect +import io import socket +import wave + +import numpy as np +import rpyc import torch -import inspect from typing import List from rpyc.utils.classic import obtain from rpyc.utils.server import ThreadedServer @@ -13,6 +17,21 @@ from lightllm.utils.infer_utils import set_random_seed from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient from lightllm.utils.graceful_utils import graceful_registry +from lightllm.utils.log_utils import init_logger + + +logger = init_logger(__name__) + + +def _generate_silence_wav_bytes(sample_rate: int = 16000, num_samples: int = 16000) -> bytes: + samples = np.zeros(num_samples, dtype=np.int16) + buffer = io.BytesIO() + with wave.open(buffer, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(samples.tobytes()) + return buffer.getvalue() class AudioModelRpcServer(rpyc.Service): @@ -72,6 +91,13 @@ def exposed_encode(self, audios): audios = obtain(audios) return self.forward(audios) + def exposed_warmup_model(self): + torch.cuda.set_device(self.dp_rank_id) + warmup_audio = _generate_silence_wav_bytes() + self.model.warmup(warmup_audio) + logger.info(f"audio model warmup finished on dp_rank_id:{self.dp_rank_id}") + return + class AudioModelRpcClient: def __init__(self, model_rpc, world_size, rpc_server_process=None): @@ -94,9 +120,11 @@ async def _func(*args, **kwargs): self._init_model = async_wrap(self.model.init_model) self._encode = async_wrap(self.model.encode) + self._warmup_model = async_wrap(self.model.warmup_model) else: self._init_model = self.model.exposed_init_model self._encode = self.model.exposed_encode + self._warmup_model = self.model.exposed_warmup_model return async def init_model(self, kvargs): @@ -111,6 +139,12 @@ async def encode(self, audios: List[AudioItem]): return await ans return ans + async def warmup_model(self): + ans = self._warmup_model() + if self.use_rpc: + return await ans + return ans + def _init_env(port, device_id): graceful_registry(inspect.currentframe().f_code.co_name) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index e28e4c93ad..3a818b0a39 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -124,6 +124,13 @@ def __init__( self.latest_success_infer_time_mark.set_value(int(time.time())) return + def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs): + cost_ms = (time.time() - start_time) * 1000.0 + extras = " ".join(f"{k}:{v}" for k, v in kwargs.items()) + suffix = f" {extras}" if extras else "" + logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") + return + async def _alloc_resource(self, items, md5sums, token_nums, datas): while True: @@ -287,6 +294,10 @@ async def generate( start_time = time.time() request_headers = request.headers if request is not None else {} group_request_id = self.alloc_req_id(sampling_params, is_health_req) + if request is not None: + request.state.lightllm_req_id = group_request_id + audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0 + image_count = len(multimodal_params.images) if multimodal_params is not None else 0 try: original_multimodal_params = None @@ -295,11 +306,26 @@ async def generate( if self.pd_mode.is_P_or_NORMAL(): await multimodal_params.verify_and_preload(request) + self._log_stage_timing( + group_request_id, + start_time, + "verify_and_preload_done", + audio_count=audio_count, + image_count=image_count, + ) # 记录请求到达的相关信息 await self._log_req_header(request_headers, group_request_id) # encode prompt_ids = await self._encode(prompt, multimodal_params, sampling_params) + self._log_stage_timing( + group_request_id, + start_time, + "encode_done", + prompt_tokens=len(prompt_ids), + audio_count=audio_count, + image_count=image_count, + ) prompt_tokens = len(prompt_ids) # 监控 @@ -308,6 +334,13 @@ async def generate( self.metric_client.histogram_observe("lightllm_request_input_length", prompt_tokens) self.metric_client.histogram_observe("lightllm_request_max_new_tokens", sampling_params.max_new_tokens) prompt_ids = await self._check_and_repair_length(prompt_ids, sampling_params) + self._log_stage_timing( + group_request_id, + start_time, + "check_and_repair_length_done", + prompt_tokens=len(prompt_ids), + max_new_tokens=sampling_params.max_new_tokens, + ) if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP(): # 在 nixl pd 模式下的 p 节点, 为了更好的兼容多模态的推理流程,np 节点需要先上报其 encode 好的 prompt ids 信息,然后 @@ -355,6 +388,12 @@ async def generate( chunked_prefill_size=self.args.chunked_prefill_size, ) req_objs.append(req_obj) + self._log_stage_timing( + group_request_id, + start_time, + "shm_req_init_done", + req_count=len(req_objs), + ) logger.debug( f"alloc shm_req for req_id {group_request_id}, " @@ -368,6 +407,13 @@ async def generate( await self.transfer_to_next_module_or_node( prompt, sampling_params, original_multimodal_params, req_status.group_req_objs ) + self._log_stage_timing( + group_request_id, + start_time, + "request_forwarded", + has_audio=audio_count > 0, + has_image=image_count > 0, + ) results_generator = self._wait_to_token_package( start_time, @@ -445,7 +491,15 @@ async def _encode( ), "too many multimodal items!" if multimodal_params.audios: assert not self.args.disable_audio, "audio multimodal not enabled" + encode_start_time = time.time() await self._alloc_multimodal_resources(multimodal_params, sampling_params) + log_req_id = getattr(sampling_params, "group_request_id", None) + logger.info( + f"lightllm_req_id:{log_req_id} " + f"stage:alloc_multimodal_resources_done " + f"elapsed_ms:{(time.time() - encode_start_time) * 1000.0:.3f} " + f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}" + ) prompt_ids = self.tokenizer.encode( prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens ) @@ -539,20 +593,39 @@ async def transfer_to_next_module( if self.pd_mode.is_P_or_NORMAL(): if not self.args.disable_vision: + logger.info( + f"lightllm_req_id:{group_req_objs.group_req_id} " + f"stage:transfer_to_visual " + f"target_port:{self.args.visual_port}" + ) self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return if not self.args.disable_audio: + logger.info( + f"lightllm_req_id:{group_req_objs.group_req_id} " + f"stage:transfer_to_audio " + f"target_port:{self.args.audio_port}" + ) self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return if self.args.enable_cpu_cache: + logger.info( + f"lightllm_req_id:{group_req_objs.group_req_id} " + f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}" + ) self.send_to_multi_level_kv_cache.send_pyobj( group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL, ) return + logger.info( + f"lightllm_req_id:{group_req_objs.group_req_id} " + f"stage:transfer_to_router " + f"target_port:{self.args.router_port}" + ) self.send_to_router.send_pyobj( group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL, @@ -561,6 +634,11 @@ async def transfer_to_next_module( if self.pd_mode.is_D(): # 在 D 模式下,不需要传输真的多模态参数,因为其已经被 P 处理好了 + logger.info( + f"lightllm_req_id:{group_req_objs.group_req_id} " + f"stage:transfer_to_router_from_decode " + f"target_port:{self.args.router_port}" + ) self.send_to_router.send_pyobj( group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL, @@ -619,6 +697,11 @@ async def _wait_to_token_package( first_token_cost_ms = (time.time() - start_time) * 1000 is_first_token = False self.first_time_costs.add(first_token_cost_ms) + logger.info( + f"lightllm_req_id:{group_request_id} " + f"stage:first_token_arrived elapsed_ms:{first_token_cost_ms:.3f} " + f"sub_req_id:{sub_req_id} prompt_tokens:{prompt_tokens}" + ) out_token_counter += 1 diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 09a07455b3..cd9d652ab8 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -1,5 +1,7 @@ """Multimodal parameters for text generation.""" import os +import wave +import time import librosa import base64 from typing import List @@ -12,6 +14,17 @@ logger = init_logger(__name__) +def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes: + num_samples = max(1, int(sample_rate * duration_seconds)) + with BytesIO() as buffer: + with wave.open(buffer, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(b"\x00\x00" * num_samples) + return buffer.getvalue() + + class AudioItem: def __init__(self, **kwargs): self._type = kwargs["type"] @@ -32,6 +45,9 @@ def __init__(self, **kwargs): async def preload(self, request: Request): try: + req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None) + preload_start = time.time() + source_ready_start = preload_start if self._type == "url": timeout = int(os.getenv("REQUEST_TIMEOUT", "5")) proxy = os.getenv("REQUEST_PROXY", None) @@ -40,13 +56,22 @@ async def preload(self, request: Request): audio_data = base64.b64decode(self._data) else: raise ValueError(f"cannot read audio which type is {self._type}!") + source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0 # check if valid audio bytes + decode_start = time.time() audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000) + decode_cost_ms = (time.time() - decode_start) * 1000.0 from lightllm.models.whisper.defaults import MIN_AUDIO_LEN self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度 self._preload_data = audio_data + logger.info( + f"lightllm_req_id:{req_id} stage:audio_preload_done " + f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} " + f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} " + f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length}" + ) return except Exception as e: @@ -184,3 +209,13 @@ def to_origin_dict(self): ret["images"] = [i.to_origin_dict() for i in self.images] ret["audios"] = [a.to_origin_dict() for a in self.audios] return ret + + +async def warmup_audio_preload(): + warmup_audio = AudioItem( + type="base64", + data=base64.b64encode(generate_silence_wav_bytes()).decode("utf-8"), + ) + await warmup_audio.preload(None) + warmup_audio.read() + return From a3872599dc98eecd98e28915e0d77f09d96e61ec Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Mon, 30 Mar 2026 00:45:08 +0000 Subject: [PATCH 04/51] add http client cache --- lightllm/utils/multimodal_utils.py | 35 +++++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py index 14c8303273..6e3766f950 100644 --- a/lightllm/utils/multimodal_utils.py +++ b/lightllm/utils/multimodal_utils.py @@ -3,10 +3,14 @@ import httpx from PIL import Image from io import BytesIO +from urllib.parse import urlparse +from typing import Dict, Optional from fastapi import Request from lightllm.utils.log_utils import init_logger logger = init_logger(__name__) +_HTTP_CLIENTS: Dict[Optional[str], httpx.AsyncClient] = {} +_LOCAL_RESOURCE_HOSTS = {"127.0.0.1", "localhost", "::1"} def image2base64(img_str: str): @@ -21,20 +25,25 @@ def image2base64(img_str: str): async def fetch_resource(url, request: Request, timeout, proxy=None): logger.info(f"Begin to download resource from url: {url}") start_time = time.time() - async with httpx.AsyncClient(proxy=proxy) as client: - async with client.stream("GET", url, timeout=timeout) as response: - response.raise_for_status() - ans_bytes = [] - async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): - if request is not None and await request.is_disconnected(): - await response.aclose() - raise Exception("Request disconnected. User cancelled download.") - ans_bytes.append(chunk) - # 接收的数据不能大于128M - if len(ans_bytes) > 128: - raise Exception(f"url {url} recv data is too big") + hostname = urlparse(url).hostname + effective_proxy = None if hostname in _LOCAL_RESOURCE_HOSTS else proxy + client = _HTTP_CLIENTS.get(effective_proxy) + if client is None: + client = httpx.AsyncClient(proxy=effective_proxy) + _HTTP_CLIENTS[effective_proxy] = client + async with client.stream("GET", url, timeout=timeout) as response: + response.raise_for_status() + ans_bytes = [] + async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): + if request is not None and await request.is_disconnected(): + await response.aclose() + raise Exception("Request disconnected. User cancelled download.") + ans_bytes.append(chunk) + # 接收的数据不能大于128M + if len(ans_bytes) > 128: + raise Exception(f"url {url} recv data is too big") - content = b"".join(ans_bytes) + content = b"".join(ans_bytes) end_time = time.time() cost_time = end_time - start_time logger.info(f"Download url {url} resource cost time: {cost_time} seconds") From cd89cd613117c33a5900dc2fb2466ea2d5599797 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Mon, 30 Mar 2026 00:48:15 +0000 Subject: [PATCH 05/51] reduce polling time --- lightllm/server/audioserver/manager.py | 7 ++++++- lightllm/server/router/manager.py | 9 +++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index b4fb002965..ac4058b643 100644 --- a/lightllm/server/audioserver/manager.py +++ b/lightllm/server/audioserver/manager.py @@ -55,6 +55,7 @@ def __init__( self.model_rpcs: List[AudioModelRpcClient] = [] self.req_stage_times: Dict[int, Dict[str, float]] = {} self.next_module_port = args.multi_level_kv_cache_port if args.enable_cpu_cache else args.router_port + self.waiting_reqs_event = asyncio.Event() def _mark_req_stage(self, req_id: int, stage: str): now = time.time() @@ -131,7 +132,10 @@ async def infer_audios(self, audios: List[AudioItem]): async def loop_for_fwd(self): while True: if len(self.waiting_reqs) == 0: - await asyncio.sleep(0.01) # 10ms + self.waiting_reqs_event.clear() + if len(self.waiting_reqs) == 0: + await self.waiting_reqs_event.wait() + continue else: processing_group_reqs = [] audios_need_infer = [] @@ -249,6 +253,7 @@ async def loop_for_netio_req(self): waiting_queue_size=len(self.waiting_reqs), ) self.waiting_reqs.append(recv_req) + self.waiting_reqs_event.set() else: assert False, f"Error Req Inf {recv_req}" diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index 0d2705fab2..f5e0b8df9a 100644 --- a/lightllm/server/router/manager.py +++ b/lightllm/server/router/manager.py @@ -316,8 +316,7 @@ async def _add_batch(self, batch: Batch): # 添加新请求 reqs = [r.to_router_rpc_obj() for r in batch.reqs] while not self.shm_reqs_io_buffer.is_empty(): - await asyncio.sleep(0.02) - + await asyncio.sleep(0.001) self.shm_reqs_io_buffer.write_obj(reqs) self.shm_reqs_io_buffer.set_ready() logger.debug(f"Prefill Batch: {batch.simple_log()} \n") @@ -326,8 +325,7 @@ async def _add_batch(self, batch: Batch): async def _aborted_reqs(self, aborted_reqs: List[Req]): cmds = [AbortedReqCmd(req_id=r.request_id) for r in aborted_reqs] while not self.shm_reqs_io_buffer.is_empty(): - await asyncio.sleep(0.02) - + await asyncio.sleep(0.001) self.shm_reqs_io_buffer.write_obj(cmds) self.shm_reqs_io_buffer.set_ready() return @@ -335,8 +333,7 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]): async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]): cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs] while not self.shm_reqs_io_buffer.is_empty(): - await asyncio.sleep(0.02) - + await asyncio.sleep(0.001) self.shm_reqs_io_buffer.write_obj(cmds) self.shm_reqs_io_buffer.set_ready() return From 4788980006dc0be673e151d0c9c8f4cf12afcfdf Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Mon, 30 Mar 2026 01:27:53 +0000 Subject: [PATCH 06/51] Optimize audio shm payload handling and cache lookups --- .../qwen3_omni_audio.py | 47 ++++++++++++------- lightllm/models/whisper/whisper_audio.py | 20 ++------ .../embed_cache/impl/naive_memory_cache.py | 2 + lightllm/server/httpserver/manager.py | 21 ++++----- lightllm/server/multimodal_params.py | 37 ++++++++++++++- 5 files changed, 82 insertions(+), 45 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 6c620448b9..424a768bbf 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -1,6 +1,7 @@ import os import json import math +import time import torch import rpyc import librosa @@ -10,16 +11,18 @@ from safetensors import safe_open from torch.nn import functional as F from typing import Callable, Optional, Union, List -from rpyc.utils.classic import obtain - from transformers.activations import ACT2FN -from lightllm.server.multimodal_params import AudioItem +from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor +from lightllm.utils.log_utils import init_logger + + +logger = init_logger(__name__) def _get_feat_extract_output_lengths(input_lengths): @@ -338,6 +341,11 @@ def forward( return hidden_states def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedCacheClient): + encode_start = time.time() + load_shm_cost = 0.0 + preprocess_cost = 0.0 + forward_cost = 0.0 + cache_copy_cost = 0.0 uuids = [] items: List[AudioItem] = [] per_audio_features: List[torch.Tensor] = [] @@ -345,12 +353,14 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC if isinstance(item, AudioItem): uuids.append(item.uuid) items.append(item) + load_start = time.time() audio_data = read_shm(get_shm_name_data(item.uuid)) - audio = BytesIO(audio_data) - audio, _ = librosa.load(audio, sr=self.processor.sampling_rate) + audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate) + load_shm_cost += time.time() - load_start else: raise ValueError(f"cannot read audio which type is {type(item)}!") + preprocess_start = time.time() input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) if feature_attention_mask is not None: audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) @@ -361,22 +371,19 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC feature_lens = ( audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) ) + preprocess_cost += time.time() - preprocess_start + forward_start = time.time() audio_features = self.forward( input_features, feature_lens=feature_lens, ) + forward_cost += time.time() - forward_start per_audio_features.append(audio_features) - ready_audio = obtain(self.cache_client.root.get_items_embed(uuids)) - ids_to_set = [] - for i, ready in enumerate(ready_audio): - if ready: - continue - - uid = uuids[i] + cache_copy_start = time.time() + for i, uid in enumerate(uuids): item = items[i] - cur_embed = per_audio_features[i] cpu_embed_cache_client.copy_to_cache( embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache @@ -384,11 +391,19 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC assert ( item.token_num == cur_embed.shape[0] ), f"audio token num not match {item.token_num} vs {cur_embed.shape[0]} " - ids_to_set.append(uid) - if ids_to_set: - self.cache_client.root.set_items_embed(ids=ids_to_set) + if uuids: torch.cuda.current_stream().synchronize() + self.cache_client.root.set_items_embed(ids=uuids) + cache_copy_cost += time.time() - cache_copy_start + logger.info( + f"audio_encode_batch_done audio_count:{len(audio_items)} " + f"load_shm_ms:{load_shm_cost * 1000.0:.3f} " + f"preprocess_ms:{preprocess_cost * 1000.0:.3f} " + f"forward_ms:{forward_cost * 1000.0:.3f} " + f"cache_ms:{cache_copy_cost * 1000.0:.3f} " + f"elapsed_ms:{(time.time() - encode_start) * 1000.0:.3f}" + ) @torch.no_grad() def warmup(self, audio_bytes: bytes): diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index 0493afdb9a..a94d22dd0c 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -10,8 +10,7 @@ from safetensors.torch import load_file from transformers.processing_utils import ProcessorMixin from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data -from lightllm.server.multimodal_params import AudioItem -from rpyc.utils.classic import obtain +from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient # tokenizer_class removed @@ -175,8 +174,7 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC uuids.append(item.uuid) items.append(item) audio_data = read_shm(get_shm_name_data(item.uuid)) - audio = BytesIO(audio_data) - audio, _ = librosa.load(audio, sr=16000) + audio = load_audio_from_shm_payload(audio_data, item.extra_params, 16000) else: raise ValueError(f"cannot read audio which type is {type(item)}!") @@ -222,25 +220,17 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC continue per_audio_embeds[owner].append(audios[chunk_idx][:token_len]) - ready_audio = obtain(self.cache_client.root.get_items_embed(uuids)) - ids_to_set = [] - for i, ready in enumerate(ready_audio): - if ready: - continue - - uid = uuids[i] + for i, uid in enumerate(uuids): item = items[i] - # 拼接该 audio 的所有 chunk embedding cur_embed = torch.cat(per_audio_embeds[i], dim=0) cpu_embed_cache_client.copy_to_cache( embed_tensor=cur_embed, start_index_in_cache=item.start_index_in_embed_cache ) - ids_to_set.append(uid) - if ids_to_set: - self.cache_client.root.set_items_embed(ids=ids_to_set) + if uuids: torch.cuda.current_stream().synchronize() + self.cache_client.root.set_items_embed(ids=uuids) @torch.no_grad() def warmup(self, audio_bytes: bytes): diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py index 5ad26fbcc8..ff7b2374b2 100644 --- a/lightllm/server/embed_cache/impl/naive_memory_cache.py +++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py @@ -205,6 +205,8 @@ def alloc(self, md5sum_list: list[str], token_num_list: list[int]) -> Optional[l "token_id": rec.token_id, "start_index_in_embed_cache": rec.mem_block.start, "token_num": rec.token_num, + "data_ready": rec.data, + "embed_ready": rec.embed, } ) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 3a818b0a39..8b3be9b0e8 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -144,23 +144,17 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): logger.error(str(records) + "and try to set --embed_cache_storage_size bigger") raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger") - uid_list = [] - for item, rec in zip(items, records): + update_data_ids = [] + for item, rec, data in zip(items, records, datas): item: Union[ImageItem, AudioItem] = item item.uuid = rec["id"] item.token_id = rec["token_id"] item.token_num = rec["token_num"] item.start_index_in_embed_cache = rec["start_index_in_embed_cache"] - uid_list.append(rec["id"]) - - ready_flags = obtain(self.cache_client.root.get_items_data(uid_list)) - update_data_ids = [] - - for uid, ready, data in zip(uid_list, ready_flags, datas): - if not ready: - create_shm(get_shm_name_data(uid), data) - update_data_ids.append(uid) + if not rec["data_ready"]: + create_shm(get_shm_name_data(rec["id"]), data) + update_data_ids.append(rec["id"]) if update_data_ids: self.cache_client.root.set_items_data(update_data_ids) @@ -188,7 +182,10 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) data = audio.read() token_num = self.tokenizer.get_audio_token_length(audio) - md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params))) + payload_md5 = audio.extra_params.get("audio_payload_md5") + if payload_md5 is None: + payload_md5 = hashlib.md5(data).hexdigest() + md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params))) md5sums.append(md5sum) tokens_nums.append(token_num) datas.append(data) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index cd9d652ab8..13a26d9b57 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -4,6 +4,8 @@ import time import librosa import base64 +import hashlib +import numpy as np from typing import List from io import BytesIO from PIL import Image @@ -12,6 +14,9 @@ from lightllm.utils.log_utils import init_logger logger = init_logger(__name__) +RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes" +WAVEFORM_F32_SHM_FORMAT = "waveform_f32" +AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW" def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes: @@ -25,6 +30,22 @@ def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float return buffer.getvalue() +def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray: + audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT) + if audio_shm_format == WAVEFORM_F32_SHM_FORMAT: + num_samples = int(extra_params.get("audio_num_samples", 0)) + if num_samples > 0: + return np.frombuffer(audio_data, dtype=np.float32, count=num_samples) + return np.frombuffer(audio_data, dtype=np.float32) + + audio, _ = librosa.load(BytesIO(audio_data), sr=sample_rate) + return np.asarray(audio, dtype=np.float32) + + +def should_use_raw_audio_shm() -> bool: + return os.getenv(AUDIO_SHM_USE_RAW_ENV, "0") == "1" + + class AudioItem: def __init__(self, **kwargs): self._type = kwargs["type"] @@ -61,16 +82,28 @@ async def preload(self, request: Request): # check if valid audio bytes decode_start = time.time() audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000) + audio_values = np.asarray(audio_values, dtype=np.float32) decode_cost_ms = (time.time() - decode_start) * 1000.0 from lightllm.models.whisper.defaults import MIN_AUDIO_LEN self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度 - self._preload_data = audio_data + if should_use_raw_audio_shm(): + self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT + self.extra_params.pop("audio_sample_rate", None) + self.extra_params.pop("audio_num_samples", None) + self._preload_data = audio_data + else: + self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT + self.extra_params["audio_sample_rate"] = 16000 + self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) + self._preload_data = audio_values.tobytes() + self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() logger.info( f"lightllm_req_id:{req_id} stage:audio_preload_done " f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} " f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} " - f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length}" + f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length} " + f"shm_format:{self.extra_params['audio_shm_format']}" ) return From 7b05403af6df9f42d294c5b28ee76fd7c4b89342 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Mon, 30 Mar 2026 01:31:21 +0000 Subject: [PATCH 07/51] cache hann_window/mel_filters --- .../qwen3_omni_moe_thinker/audio_process.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py index 833cc8f4b0..e9dc931886 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py +++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py @@ -46,10 +46,25 @@ def __init__( norm="slaney", mel_scale="slaney", ) + self._hann_window_cache = {} + self._mel_filters_cache = {} + + def _get_cached_feature_tensors(self, device: Union[str, torch.device]): + device_key = str(device) + window = self._hann_window_cache.get(device_key) + if window is None: + window = torch.hann_window(self.n_fft, device=device) + self._hann_window_cache[device_key] = window + + mel_filters = self._mel_filters_cache.get(device_key) + if mel_filters is None: + mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32) + self._mel_filters_cache[device_key] = mel_filters + return window, mel_filters def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray: waveform = torch.from_numpy(waveform).to(device, torch.float32) - window = torch.hann_window(self.n_fft, device=device) + window, mel_filters = self._get_cached_feature_tensors(device) if self.dither != 0.0: waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device) @@ -57,7 +72,6 @@ def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True) magnitudes = stft[..., :-1].abs() ** 2 - mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32) mel_spec = mel_filters.T @ magnitudes log_spec = torch.clamp(mel_spec, min=1e-10).log10() From 713c45d912aec4b6955aaf6e55be0ef8e5705dd6 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Mon, 30 Mar 2026 08:26:52 +0000 Subject: [PATCH 08/51] Fix audio preload config to follow tokenizer settings --- .../common/basemodel/multimodal_tokenizer.py | 1 + lightllm/models/internvl/model.py | 5 +++++ .../models/qwen3_omni_moe_thinker/model.py | 5 +++++ lightllm/server/api_http.py | 4 +++- lightllm/server/httpserver/manager.py | 4 +++- lightllm/server/multimodal_params.py | 20 +++++++++++-------- 6 files changed, 29 insertions(+), 10 deletions(-) diff --git a/lightllm/common/basemodel/multimodal_tokenizer.py b/lightllm/common/basemodel/multimodal_tokenizer.py index cdcbd7f089..872a418bf7 100644 --- a/lightllm/common/basemodel/multimodal_tokenizer.py +++ b/lightllm/common/basemodel/multimodal_tokenizer.py @@ -33,6 +33,7 @@ class BaseMultiModalTokenizer(ABC): def __init__(self, tokenizer, **kwargs): self.tokenizer = tokenizer + self.audio_preload_config = None def __getattr__(self, name): obj_dict = object.__getattribute__(self, "__dict__") diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py index ccb76d3512..70c797aeb8 100644 --- a/lightllm/models/internvl/model.py +++ b/lightllm/models/internvl/model.py @@ -50,6 +50,11 @@ def __init__(self, tokenizer, model_cfg, **kwargs): self.audio_min_length = MIN_AUDIO_LEN self.audio_max_length = 16000 * 30 + self.audio_preload_config = { + "sampling_rate": 16000, + "hop_length": 160, + "min_audio_len": int(self.audio_min_length), + } def init_imageitem_extral_params( self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py index a1419f83ef..4a5131bbf1 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/model.py +++ b/lightllm/models/qwen3_omni_moe_thinker/model.py @@ -42,6 +42,11 @@ def __init__(self, tokenizer=None, processor=None, **kwargs): self.sampling_rate = self.audio_processor.sampling_rate self.n_samples = self.audio_processor.n_samples self.hop_length = self.audio_processor.hop_length + self.audio_preload_config = { + "sampling_rate": int(self.sampling_rate), + "hop_length": int(self.hop_length), + "min_audio_len": int(MIN_AUDIO_LEN), + } self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"] self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"] diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py index 6be738befc..cb7619fbe5 100755 --- a/lightllm/server/api_http.py +++ b/lightllm/server/api_http.py @@ -272,7 +272,9 @@ async def tokens(request: Request): multimodal_params_dict = request_dict.get("multimodal_params", {}) multimodal_params = MultimodalParams(**multimodal_params_dict) - await multimodal_params.verify_and_preload(request) + await multimodal_params.verify_and_preload( + request, audio_preload_config=getattr(g_objs.httpserver_manager.tokenizer, "audio_preload_config", None) + ) return JSONResponse( { "ntokens": g_objs.httpserver_manager.tokens( diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 8b3be9b0e8..9a6864774a 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -302,7 +302,9 @@ async def generate( original_multimodal_params = copy.deepcopy(multimodal_params) if self.pd_mode.is_P_or_NORMAL(): - await multimodal_params.verify_and_preload(request) + await multimodal_params.verify_and_preload( + request, audio_preload_config=getattr(self.tokenizer, "audio_preload_config", None) + ) self._log_stage_timing( group_request_id, start_time, diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 13a26d9b57..440bff06c5 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -17,6 +17,8 @@ RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes" WAVEFORM_F32_SHM_FORMAT = "waveform_f32" AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW" +DEFAULT_AUDIO_SAMPLE_RATE = 16000 +DEFAULT_MIN_AUDIO_LEN = 480 def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes: @@ -64,7 +66,7 @@ def __init__(self, **kwargs): self._preload_data = None self.extra_params = {} - async def preload(self, request: Request): + async def preload(self, request: Request, audio_preload_config: dict = None): try: req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None) preload_start = time.time() @@ -79,14 +81,16 @@ async def preload(self, request: Request): raise ValueError(f"cannot read audio which type is {self._type}!") source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0 + audio_preload_config = audio_preload_config or {} + target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE)) + min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN)) + # check if valid audio bytes decode_start = time.time() - audio_values, _ = librosa.load(BytesIO(audio_data), sr=16000) + audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate) audio_values = np.asarray(audio_values, dtype=np.float32) decode_cost_ms = (time.time() - decode_start) * 1000.0 - from lightllm.models.whisper.defaults import MIN_AUDIO_LEN - - self.audio_length = max(audio_values.shape[0], MIN_AUDIO_LEN) # 如果音频过短,会被pad到480的长度 + self.audio_length = max(audio_values.shape[0], min_audio_len) if should_use_raw_audio_shm(): self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT self.extra_params.pop("audio_sample_rate", None) @@ -94,7 +98,7 @@ async def preload(self, request: Request): self._preload_data = audio_data else: self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT - self.extra_params["audio_sample_rate"] = 16000 + self.extra_params["audio_sample_rate"] = target_sample_rate self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) self._preload_data = audio_values.tobytes() self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() @@ -221,11 +225,11 @@ def __init__( self.audios = [AudioItem(**a) for a in audios] return - async def verify_and_preload(self, request: Request): + async def verify_and_preload(self, request: Request, audio_preload_config: dict = None): for image in self.images: await image.preload(request) for audio in self.audios: - await audio.preload(request) + await audio.preload(request, audio_preload_config=audio_preload_config) return def to_dict(self): From 65a3ec67bb94bd41b604e415f2e227ae35c81ef9 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 31 Mar 2026 09:27:15 +0000 Subject: [PATCH 09/51] Optimize qwen3 omni audio preprocessing fast path --- .../qwen3_omni_moe_thinker/audio_process.py | 23 +++++++++++ .../qwen3_omni_audio.py | 41 +++++++++++-------- lightllm/server/multimodal_params.py | 16 +++++++- 3 files changed, 61 insertions(+), 19 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py index e9dc931886..42eae8edb5 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py +++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py @@ -105,6 +105,29 @@ def zero_mean_unit_var_norm( return normed_input_values + def _preprocess_single_padded( + self, + raw_speech: np.ndarray, + num_frames: int, + device: Optional[str] = "cpu", + ) -> Tuple[torch.Tensor, torch.Tensor]: + waveform = np.asarray(raw_speech, dtype=np.float32) + if waveform.ndim != 1: + raise ValueError(f"single audio fast path expects 1D waveform, got shape={waveform.shape}") + + extracted = self._torch_extract_fbank_features(waveform[None, :], device) + extracted = np.asarray(extracted, dtype=np.float32) + if extracted.ndim != 3: + raise ValueError(f"unexpected extracted feature shape={extracted.shape}") + + if extracted.shape[-1] < num_frames: + raise ValueError(f"feature frames {extracted.shape[-1]} < requested num_frames {num_frames}") + + compact_features = torch.from_numpy(extracted[:, :, :num_frames]).to(device="cuda", dtype=torch.bfloat16) + compact_features = compact_features[0].contiguous() + feature_lens = torch.tensor([num_frames], device="cuda", dtype=torch.long) + return compact_features, feature_lens + def _preprocess( self, raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 424a768bbf..f3cd0525eb 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -13,7 +13,7 @@ from typing import Callable, Optional, Union, List from transformers.activations import ACT2FN -from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload +from lightllm.server.multimodal_params import AudioItem, WAVEFORM_F32_SHM_FORMAT, load_audio_from_shm_payload from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.server.embed_cache.embed_cache_client import CpuEmbedCacheClient from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager @@ -356,21 +356,27 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC load_start = time.time() audio_data = read_shm(get_shm_name_data(item.uuid)) audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate) + audio_num_frames = item.extra_params.get("audio_num_frames") load_shm_cost += time.time() - load_start else: raise ValueError(f"cannot read audio which type is {type(item)}!") preprocess_start = time.time() - input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) - if feature_attention_mask is not None: - audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) - input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0) + if audio_num_frames is not None and item.extra_params.get("audio_shm_format") == WAVEFORM_F32_SHM_FORMAT: + input_features, feature_lens = self.processor._preprocess_single_padded( + audio, int(audio_num_frames), device="cpu" + ) else: - audio_feature_lengths = None - - feature_lens = ( - audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) - ) + input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) + if feature_attention_mask is not None: + audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) + input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0) + else: + audio_feature_lengths = None + + feature_lens = ( + audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) + ) preprocess_cost += time.time() - preprocess_start forward_start = time.time() @@ -409,14 +415,13 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC def warmup(self, audio_bytes: bytes): audio = BytesIO(audio_bytes) audio, _ = librosa.load(audio, sr=self.processor.sampling_rate) - input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) - if feature_attention_mask is not None: - audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) - input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0) - else: - audio_feature_lengths = None - - feature_lens = audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) + num_frames = (max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length + padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * ( + self.processor.hop_length + ) + if padded_len > audio.shape[0]: + audio = np.pad(audio, (0, padded_len - audio.shape[0]), mode="constant", constant_values=0.0) + input_features, feature_lens = self.processor._preprocess_single_padded(audio, num_frames, device="cpu") _ = self.forward( input_features, feature_lens=feature_lens, diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 440bff06c5..da5d239c6a 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -18,6 +18,7 @@ WAVEFORM_F32_SHM_FORMAT = "waveform_f32" AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW" DEFAULT_AUDIO_SAMPLE_RATE = 16000 +DEFAULT_AUDIO_HOP_LENGTH = 160 DEFAULT_MIN_AUDIO_LEN = 480 @@ -83,6 +84,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None): audio_preload_config = audio_preload_config or {} target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE)) + hop_length = int(audio_preload_config.get("hop_length", DEFAULT_AUDIO_HOP_LENGTH)) min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN)) # check if valid audio bytes @@ -90,16 +92,28 @@ async def preload(self, request: Request, audio_preload_config: dict = None): audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate) audio_values = np.asarray(audio_values, dtype=np.float32) decode_cost_ms = (time.time() - decode_start) * 1000.0 - self.audio_length = max(audio_values.shape[0], min_audio_len) + effective_audio_len = max(audio_values.shape[0], min_audio_len) + padded_audio_len = ((effective_audio_len + hop_length - 1) // hop_length) * hop_length + if padded_audio_len > audio_values.shape[0]: + audio_values = np.pad( + audio_values, + (0, padded_audio_len - audio_values.shape[0]), + mode="constant", + constant_values=0.0, + ) + + self.audio_length = effective_audio_len if should_use_raw_audio_shm(): self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT self.extra_params.pop("audio_sample_rate", None) self.extra_params.pop("audio_num_samples", None) + self.extra_params.pop("audio_num_frames", None) self._preload_data = audio_data else: self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT self.extra_params["audio_sample_rate"] = target_sample_rate self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) + self.extra_params["audio_num_frames"] = int((effective_audio_len + hop_length - 1) // hop_length) self._preload_data = audio_values.tobytes() self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() logger.info( From 2e480081b77a6166a89d908ad5a465e3eaefe0fd Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 31 Mar 2026 09:27:58 +0000 Subject: [PATCH 10/51] Add audio server fast path for single pending requests --- lightllm/server/audioserver/manager.py | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index ac4058b643..d54856c265 100644 --- a/lightllm/server/audioserver/manager.py +++ b/lightllm/server/audioserver/manager.py @@ -167,6 +167,38 @@ async def loop_for_fwd(self): else: ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids)) + pending_audios = [audio for audio, ready in zip(multimodal_params.audios, ready_audio) if not ready] + if ( + pending_audios + and len(processing_group_reqs) == 0 + and len(self.waiting_reqs) == 0 + and len(pending_audios) < self.infer_batch_size + ): + logger.info( + f"audio_batch_ready req_ids:[{group_req_indexes.group_req_id}] " + f"audio_count:{len(pending_audios)} infer_batch_size:{self.infer_batch_size} fast_path:1" + ) + self._log_req_stage( + group_req_indexes.group_req_id, + "audio_infer_start", + batch_audio_count=len(pending_audios), + ) + await self.infer_audios(pending_audios) + self._log_req_stage( + group_req_indexes.group_req_id, + "audio_infer_done", + batch_audio_count=len(pending_audios), + ) + self._log_req_stage( + group_req_indexes.group_req_id, + "audio_send_to_next_module", + target_port=self.next_module_port, + fast_path=1, + ) + self.send_to_next_module.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL) + self._cleanup_req_stage(group_req_indexes.group_req_id) + continue + current_req_has_pending_audio = False for audio, ready in zip(multimodal_params.audios, ready_audio): if not ready: From 456a71aab0722a646945b5154d02a14420fd14a2 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Wed, 1 Apr 2026 02:46:02 +0000 Subject: [PATCH 11/51] fix num_frames --- lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +- lightllm/server/multimodal_params.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index f3cd0525eb..04839e0ce8 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -415,7 +415,7 @@ def encode(self, audio_items: List[AudioItem], cpu_embed_cache_client: CpuEmbedC def warmup(self, audio_bytes: bytes): audio = BytesIO(audio_bytes) audio, _ = librosa.load(audio, sr=self.processor.sampling_rate) - num_frames = (max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length + num_frames = max(audio.shape[0], 480) // self.processor.hop_length padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * ( self.processor.hop_length ) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index da5d239c6a..ad70443ca7 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -113,7 +113,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None): self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT self.extra_params["audio_sample_rate"] = target_sample_rate self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) - self.extra_params["audio_num_frames"] = int((effective_audio_len + hop_length - 1) // hop_length) + self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length) self._preload_data = audio_values.tobytes() self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() logger.info( From 479367d3466aa582fb920abef62c7de9adac2abc Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 2 Apr 2026 01:55:59 +0000 Subject: [PATCH 12/51] tune fp8 --- ...8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json | 92 ++++++++++++++++ ...8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json | 92 ++++++++++++++++ ...p8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json | 101 ++++++++++++++++++ ...p8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json | 101 ++++++++++++++++++ .../{topk_num=8}_NVIDIA_GeForce_RTX_5090.json | 46 ++++++++ ...6,topk_num=8}_NVIDIA_GeForce_RTX_5090.json | 68 ++++++++++++ ...6,topk_num=8}_NVIDIA_GeForce_RTX_5090.json | 62 +++++++++++ ...orch.float16}_NVIDIA_GeForce_RTX_5090.json | 42 ++++++++ ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 46 ++++++++ ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 90 ++++++++++++++++ ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 90 ++++++++++++++++ ...orch.float16}_NVIDIA_GeForce_RTX_5090.json | 62 +++++++++++ ...rch.bfloat16}_NVIDIA_GeForce_RTX_5090.json | 68 ++++++++++++ 13 files changed, 960 insertions(+) create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..2a46877c76 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.float16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,92 @@ +{ + "1024": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "16384": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "800": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..7372d5c322 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.float16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,92 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 5, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..569382ce2f --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=2048,N=768,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,101 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 4, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..1456fd0b4b --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/grouped_matmul:v1/{K=384,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,101 @@ +{ + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": true, + "num_stages": 4, + "num_warps": 8 + }, + "128": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "16384": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "32768": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": true, + "num_stages": 3, + "num_warps": 4 + }, + "800": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 64, + "NEED_TRANS": true, + "num_stages": 4, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..0f5983241f --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_align_fused:v1/{topk_num=8}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,46 @@ +{ + "1": { + "BLOCK_SIZE": 128, + "num_warps": 1 + }, + "100": { + "BLOCK_SIZE": 256, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "4096": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE": 128, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE": 256, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..3612e98183 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,68 @@ +{ + "1": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 8 + }, + "100": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "128": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 8 + }, + "16": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "2048": { + "BLOCK_DIM": 128, + "BLOCK_M": 2, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "256": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "32": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 4 + }, + "4096": { + "BLOCK_DIM": 128, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "8": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..ff46525471 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.float16,topk_num=8}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,62 @@ +{ + "1": { + "BLOCK_DIM": 128, + "BLOCK_M": 16, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "100": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 16 + }, + "1024": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "128": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "16": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 16 + }, + "2048": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "256": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "32": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 1 + }, + "64": { + "BLOCK_DIM": 512, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 8 + }, + "8": { + "BLOCK_DIM": 256, + "BLOCK_M": 1, + "NUM_STAGE": 2, + "num_warps": 16 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..e3eb000004 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=1,Q_HEAD_NUM=8,dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,42 @@ +{ + "1": { + "num_stages": 1, + "num_warps": 1 + }, + "100": { + "num_stages": 2, + "num_warps": 1 + }, + "1024": { + "num_stages": 5, + "num_warps": 2 + }, + "128": { + "num_stages": 4, + "num_warps": 1 + }, + "16": { + "num_stages": 1, + "num_warps": 1 + }, + "2048": { + "num_stages": 4, + "num_warps": 1 + }, + "256": { + "num_stages": 2, + "num_warps": 4 + }, + "32": { + "num_stages": 5, + "num_warps": 1 + }, + "64": { + "num_stages": 5, + "num_warps": 1 + }, + "8": { + "num_stages": 1, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..9d20b4ea6b --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/mrope_triton_fused:v1/{HEAD_DIM=128,K_HEAD_NUM=2,Q_HEAD_NUM=16,dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,46 @@ +{ + "1": { + "num_stages": 4, + "num_warps": 2 + }, + "100": { + "num_stages": 1, + "num_warps": 1 + }, + "1024": { + "num_stages": 5, + "num_warps": 2 + }, + "128": { + "num_stages": 2, + "num_warps": 4 + }, + "16": { + "num_stages": 5, + "num_warps": 4 + }, + "2048": { + "num_stages": 3, + "num_warps": 2 + }, + "256": { + "num_stages": 2, + "num_warps": 2 + }, + "32": { + "num_stages": 4, + "num_warps": 1 + }, + "4096": { + "num_stages": 3, + "num_warps": 2 + }, + "64": { + "num_stages": 3, + "num_warps": 4 + }, + "8": { + "num_stages": 4, + "num_warps": 2 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..fdb476db92 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2048,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,90 @@ +{ + "1": { + "BLOCK_K": 128, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "100": { + "BLOCK_K": 64, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "1024": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_K": 128, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 5, + "num_warps": 4 + }, + "16": { + "BLOCK_K": 256, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "2048": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + }, + "32": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "4096": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_K": 128, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..5f06f89508 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/scaled_mm_per_token:v1/{K=2048,N=2560,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,90 @@ +{ + "1": { + "BLOCK_K": 256, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + }, + "100": { + "BLOCK_K": 64, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "1024": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 128, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "128": { + "BLOCK_K": 128, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "16": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 5, + "num_warps": 4 + }, + "2048": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 128, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_K": 128, + "BLOCK_M": 64, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 8 + }, + "32": { + "BLOCK_K": 128, + "BLOCK_M": 16, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "4096": { + "BLOCK_K": 64, + "BLOCK_M": 64, + "BLOCK_N": 128, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_K": 256, + "BLOCK_M": 32, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_K": 256, + "BLOCK_M": 8, + "BLOCK_N": 64, + "GROUP_M": 8, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..d0b540f69e --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.float16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,62 @@ +{ + "1024": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "16384": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 32, + "NUM_STAGES": 1, + "num_warps": 8 + }, + "800": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 0000000000..6c5307023b --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.6.0/NVIDIA_GeForce_RTX_5090/silu_and_mul_fwd:v1/{N=384,out_dtype=torch.bfloat16}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1,68 @@ +{ + "1024": { + "BLOCK_M": 8, + "BLOCK_N": 64, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "16384": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "2048": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "32768": { + "BLOCK_M": 8, + "BLOCK_N": 128, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "512": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "800": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + } +} \ No newline at end of file From 2c09aa270edea34b29f73cae2109103d75073c92 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 2 Apr 2026 02:15:11 +0000 Subject: [PATCH 13/51] set default model --- lightllm/server/api_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py index 3d9a6bc8ed..3651bf4b64 100644 --- a/lightllm/server/api_models.py +++ b/lightllm/server/api_models.py @@ -187,7 +187,7 @@ def apply_loaded_defaults(cls, data: Any): class ChatCompletionRequest(BaseModel): - model: str + model: str = "default" messages: List[ChatCompletionMessageParam] function_call: Optional[str] = "none" temperature: Optional[float] = 1 From 5168dae05ca72ebfdf51ff75fd1a109310677db2 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 2 Apr 2026 08:55:53 +0000 Subject: [PATCH 14/51] add prompt_text_cache to QWen3OmniTokenizer --- lightllm/models/qwen3_omni_moe_thinker/model.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py index 4a5131bbf1..6ae73fd1d1 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/model.py +++ b/lightllm/models/qwen3_omni_moe_thinker/model.py @@ -1,6 +1,7 @@ import os import json import librosa +from collections import OrderedDict from io import BytesIO from lightllm.common.build_utils import repair_config from lightllm.models.registry import ModelRegistry @@ -30,6 +31,8 @@ class QWen3OmniTokenizer(QWen3VLTokenizer): def __init__(self, tokenizer=None, processor=None, **kwargs): self.tokenizer = tokenizer + self._prompt_encode_cache = OrderedDict() + self._prompt_encode_cache_capacity = 64 # image self.image_processor = processor.image_processor self.min_pixel = self.image_processor.min_pixels @@ -71,6 +74,18 @@ def get_audio_token_length(self, audio: AudioItem): # print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}") return token_num + def _encode_prompt_text(self, prompt: str): + cached_ids = self._prompt_encode_cache.get(prompt) + if cached_ids is not None: + self._prompt_encode_cache.move_to_end(prompt) + return list(cached_ids) + + origin_ids = self.tokenizer.encode(prompt) + self._prompt_encode_cache[prompt] = tuple(origin_ids) + if len(self._prompt_encode_cache) > self._prompt_encode_cache_capacity: + self._prompt_encode_cache.popitem(last=False) + return origin_ids + def _caclu_audio_token_num(self, input_audio_len: int): _mel_len = input_audio_len // int(self.hop_length) input_lengths_leave = _mel_len % 100 @@ -79,7 +94,7 @@ def _caclu_audio_token_num(self, input_audio_len: int): return output_lengths def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs): - origin_ids = self.tokenizer.encode(prompt) + origin_ids = self._encode_prompt_text(prompt) # -> origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)] From 167f8b0e7449cc5a15755f0fe92edb5f5e95cd7f Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 2 Apr 2026 08:56:50 +0000 Subject: [PATCH 15/51] multi images or audios use asyncio --- lightllm/server/multimodal_params.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index ad70443ca7..ce166b5980 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -1,4 +1,5 @@ """Multimodal parameters for text generation.""" +import asyncio import os import wave import time @@ -240,10 +241,12 @@ def __init__( return async def verify_and_preload(self, request: Request, audio_preload_config: dict = None): - for image in self.images: - await image.preload(request) - for audio in self.audios: - await audio.preload(request, audio_preload_config=audio_preload_config) + preload_coroutines = [image.preload(request) for image in self.images] + preload_coroutines.extend( + audio.preload(request, audio_preload_config=audio_preload_config) for audio in self.audios + ) + if preload_coroutines: + await asyncio.gather(*preload_coroutines) return def to_dict(self): From 30d86034554ade2d9fe350986c76a31526d2b4cc Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 2 Apr 2026 08:58:05 +0000 Subject: [PATCH 16/51] single file without _resource_lock --- lightllm/server/httpserver/manager.py | 107 +++++++++++++++++--------- 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 9a6864774a..d7490ebfcd 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -131,6 +131,36 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") return + def _prepare_multimodal_resource_inputs( + self, multimodal_params: MultimodalParams, sampling_params: SamplingParams + ) -> Tuple[List[Union[ImageItem, AudioItem]], List[str], List[int], List[bytes]]: + items, md5sums, tokens_nums, datas = [], [], [], [] + + for img in multimodal_params.images: + self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) + data = img.read() + token_num = self.tokenizer.get_image_token_length(img) + md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) + md5sums.append(md5sum) + tokens_nums.append(token_num) + datas.append(data) + items.append(img) + + for audio in multimodal_params.audios: + self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) + data = audio.read() + token_num = self.tokenizer.get_audio_token_length(audio) + payload_md5 = audio.extra_params.get("audio_payload_md5") + if payload_md5 is None: + payload_md5 = hashlib.md5(data).hexdigest() + md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params))) + md5sums.append(md5sum) + tokens_nums.append(token_num) + datas.append(data) + items.append(audio) + + return items, md5sums, tokens_nums, datas + async def _alloc_resource(self, items, md5sums, token_nums, datas): while True: @@ -163,34 +193,16 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): + items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs( + multimodal_params, sampling_params + ) + if len(items) <= 1: + await self._alloc_resource(items, md5sums, tokens_nums, datas) + return # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。 # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10, # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。 async with self._resource_lock: - items, md5sums, tokens_nums, datas = [], [], [], [] - for img in multimodal_params.images: - self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) - data = img.read() - # must after init_imageitem_extral_params - token_num = self.tokenizer.get_image_token_length(img) - md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) - md5sums.append(md5sum) - tokens_nums.append(token_num) - datas.append(data) - items.append(img) - for audio in multimodal_params.audios: - self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) - data = audio.read() - token_num = self.tokenizer.get_audio_token_length(audio) - payload_md5 = audio.extra_params.get("audio_payload_md5") - if payload_md5 is None: - payload_md5 = hashlib.md5(data).hexdigest() - md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params))) - md5sums.append(md5sum) - tokens_nums.append(token_num) - datas.append(data) - items.append(audio) - await self._alloc_resource(items, md5sums, tokens_nums, datas) return @@ -295,6 +307,13 @@ async def generate( request.state.lightllm_req_id = group_request_id audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0 image_count = len(multimodal_params.images) if multimodal_params is not None else 0 + self._log_stage_timing( + group_request_id, + start_time, + "received", + has_audio=audio_count > 0, + has_image=image_count > 0, + ) try: original_multimodal_params = None @@ -316,7 +335,7 @@ async def generate( # 记录请求到达的相关信息 await self._log_req_header(request_headers, group_request_id) # encode - prompt_ids = await self._encode(prompt, multimodal_params, sampling_params) + prompt_ids = await self._encode(prompt, multimodal_params, sampling_params, start_time=start_time) self._log_stage_timing( group_request_id, start_time, @@ -481,7 +500,11 @@ async def _log_req_header(self, request_headers, group_request_id: int): return async def _encode( - self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams + self, + prompt: Union[str, List[int]], + multimodal_params: MultimodalParams, + sampling_params: SamplingParams, + start_time: Optional[float] = None, ): if isinstance(prompt, str): if self.enable_multimodal: @@ -490,15 +513,23 @@ async def _encode( ), "too many multimodal items!" if multimodal_params.audios: assert not self.args.disable_audio, "audio multimodal not enabled" - encode_start_time = time.time() await self._alloc_multimodal_resources(multimodal_params, sampling_params) log_req_id = getattr(sampling_params, "group_request_id", None) - logger.info( - f"lightllm_req_id:{log_req_id} " - f"stage:alloc_multimodal_resources_done " - f"elapsed_ms:{(time.time() - encode_start_time) * 1000.0:.3f} " - f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}" - ) + if start_time is None: + logger.info( + f"lightllm_req_id:{log_req_id} " + f"stage:alloc_multimodal_resources_done " + f"elapsed_ms:0.000 " + f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}" + ) + else: + self._log_stage_timing( + log_req_id, + start_time, + "alloc_multimodal_resources_done", + audio_count=len(multimodal_params.audios), + image_count=len(multimodal_params.images), + ) prompt_ids = self.tokenizer.encode( prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens ) @@ -592,7 +623,7 @@ async def transfer_to_next_module( if self.pd_mode.is_P_or_NORMAL(): if not self.args.disable_vision: - logger.info( + logger.debug( f"lightllm_req_id:{group_req_objs.group_req_id} " f"stage:transfer_to_visual " f"target_port:{self.args.visual_port}" @@ -601,7 +632,7 @@ async def transfer_to_next_module( return if not self.args.disable_audio: - logger.info( + logger.debug( f"lightllm_req_id:{group_req_objs.group_req_id} " f"stage:transfer_to_audio " f"target_port:{self.args.audio_port}" @@ -610,7 +641,7 @@ async def transfer_to_next_module( return if self.args.enable_cpu_cache: - logger.info( + logger.debug( f"lightllm_req_id:{group_req_objs.group_req_id} " f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}" ) @@ -620,7 +651,7 @@ async def transfer_to_next_module( ) return - logger.info( + logger.debug( f"lightllm_req_id:{group_req_objs.group_req_id} " f"stage:transfer_to_router " f"target_port:{self.args.router_port}" @@ -633,7 +664,7 @@ async def transfer_to_next_module( if self.pd_mode.is_D(): # 在 D 模式下,不需要传输真的多模态参数,因为其已经被 P 处理好了 - logger.info( + logger.debug( f"lightllm_req_id:{group_req_objs.group_req_id} " f"stage:transfer_to_router_from_decode " f"target_port:{self.args.router_port}" From db3e63b4ddb827003371d4e14650cdd3374415d3 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 2 Apr 2026 09:00:00 +0000 Subject: [PATCH 17/51] use deque instead of list --- lightllm/server/audioserver/manager.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py index d54856c265..a8ccb29891 100644 --- a/lightllm/server/audioserver/manager.py +++ b/lightllm/server/audioserver/manager.py @@ -8,6 +8,7 @@ import inspect import setproctitle import time +from collections import deque from typing import Dict, List asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) @@ -43,7 +44,7 @@ def __init__( self.cache_client = rpyc.connect("localhost", args.cache_port, config={"allow_pickle": True}) self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) self.cache_port = args.cache_port - self.waiting_reqs: List[GroupReqIndexes] = [] + self.waiting_reqs = deque() self.model_weightdir = args.model_dir self.tp_world_size = args.tp self.audio_dp = args.audio_dp @@ -140,7 +141,7 @@ async def loop_for_fwd(self): processing_group_reqs = [] audios_need_infer = [] while len(self.waiting_reqs) > 0: - group_req_indexes = self.waiting_reqs.pop(0) + group_req_indexes = self.waiting_reqs.popleft() self._log_req_stage( group_req_indexes.group_req_id, "audio_queue_picked", @@ -174,7 +175,7 @@ async def loop_for_fwd(self): and len(self.waiting_reqs) == 0 and len(pending_audios) < self.infer_batch_size ): - logger.info( + logger.debug( f"audio_batch_ready req_ids:[{group_req_indexes.group_req_id}] " f"audio_count:{len(pending_audios)} infer_batch_size:{self.infer_batch_size} fast_path:1" ) @@ -210,7 +211,7 @@ async def loop_for_fwd(self): [group_req_indexes] if current_req_has_pending_audio else [] ) batch_req_ids = [req.group_req_id for req in batch_reqs] - logger.info( + logger.debug( f"audio_batch_ready req_ids:{batch_req_ids} " f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}" ) @@ -250,7 +251,7 @@ async def loop_for_fwd(self): if len(audios_need_infer) > 0: batch_req_ids = [req.group_req_id for req in processing_group_reqs] - logger.info( + logger.debug( f"audio_batch_ready req_ids:{batch_req_ids} " f"audio_count:{len(audios_need_infer)} infer_batch_size:{self.infer_batch_size}" ) @@ -274,7 +275,7 @@ async def loop_for_netio_req(self): while True: recv_req: GroupReqIndexes = await self.zmq_recv_socket.recv_pyobj() if isinstance(recv_req, GroupReqIndexes): - logger.info( + logger.debug( f"audio recv req id {recv_req.group_req_id} " f"audio count {len(recv_req.multimodal_params.audios)}" ) From 878c2f938267f81fb5edc989f31ad93659758df4 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Fri, 3 Apr 2026 04:50:45 +0000 Subject: [PATCH 18/51] chore: format merged audio/httpserver files --- lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +- lightllm/models/whisper/whisper_audio.py | 3 ++- lightllm/server/httpserver/manager.py | 6 ------ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index c14df5ff9d..c08dd68a2f 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -101,7 +101,7 @@ def __init__(self, d_model, encoder_attention_heads, attention_dropout): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {self.num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.attention_dropout = 0.0 self.is_decoder = False self.is_causal = False diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index 7eb2948281..750bf8e158 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -11,6 +11,7 @@ from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload + # tokenizer_class removed class WhisperProcessor(ProcessorMixin): r""" @@ -38,7 +39,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True): return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps) def get_T_after_cnn(self, L_in, dilation=1): - for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "): + for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "): L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1 L_out = 1 + L_out // stride L_in = L_out diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 94065cfc3a..c9eb4de543 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -164,7 +164,6 @@ def _prepare_multimodal_resource_inputs( return items, md5sums, tokens_nums, datas async def _alloc_resource(self, items, md5sums, token_nums, datas): - while True: records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) @@ -489,7 +488,6 @@ def _count_multimodal_tokens(self, multimodal_params: MultimodalParams) -> Tuple return image_tokens, audio_tokens async def _log_req_header(self, request_headers, group_request_id: int): - x_request_id = request_headers.get("X-Request-Id", "") x_session_id = request_headers.get("X-Session-Id", "") @@ -622,7 +620,6 @@ async def transfer_to_next_module( self, group_req_objs: Optional[GroupReqObjs] = None, ): - if self.pd_mode.is_P_or_NORMAL(): if not self.args.disable_vision: logger.debug( @@ -689,7 +686,6 @@ async def _wait_to_token_package( req_status: "ReqStatus", request: Request, ): - event = req_status.event unfinished_count = sampling_params.best_of out_token_counter = 0 @@ -820,7 +816,6 @@ async def recycle_resource_loop(self): pre_time_mark = time.time() while True: - try: await asyncio.wait_for(self.recycle_event.wait(), timeout=0.02) except asyncio.TimeoutError: @@ -897,7 +892,6 @@ async def handle_loop(self): for _ in range(read_token_count): if not req.out_tokens_queue.is_empty(): - text, src_index, special, count_output_tokens = req.out_tokens_queue.peek() req.cumlogprob += float(req.shm_logprobs.arr[src_index]) metadata = { From ab788d9c41e2311f7cc1f5c41ea2bd2ec849d6db Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Fri, 3 Apr 2026 04:59:16 +0000 Subject: [PATCH 19/51] chore: improve qwen3 omni audio formatting --- lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index c08dd68a2f..c14df5ff9d 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -101,7 +101,7 @@ def __init__(self, d_model, encoder_attention_heads, attention_dropout): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {self.num_heads})." ) - self.scaling = self.head_dim**-0.5 + self.scaling = self.head_dim ** -0.5 self.attention_dropout = 0.0 self.is_decoder = False self.is_causal = False From 0570b965a732e2f5a0655164c82844d150a35bdb Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Fri, 3 Apr 2026 05:43:13 +0000 Subject: [PATCH 20/51] =?UTF-8?q?fix=C3=A2=C2=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lightllm/server/audioserver/model_infer/model_rpc.py | 12 ++++++++++++ lightllm/server/core/objs/start_args_type.py | 2 -- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py index 76d5787b48..343816e1fd 100644 --- a/lightllm/server/audioserver/model_infer/model_rpc.py +++ b/lightllm/server/audioserver/model_infer/model_rpc.py @@ -158,6 +158,10 @@ def _get_audio_items_from_infer_queue(self, max_num: int, force_same: bool = Fal return tasks def _get_audio_items_from_store_queue(self, max_num: int) -> List[AudioItem]: + """ + 与 visual 的 _get_image_items_from_store_queue 一致:store 队列中单条为 AudioItem, + 按批取出至多 max_num 条。 + """ tasks = [] task = self.store_queue.get(block=True) tasks.append(task) @@ -172,6 +176,9 @@ def _get_audio_items_from_store_queue(self, max_num: int) -> List[AudioItem]: return tasks def _infer_worker(self): + """ + 与 visual _infer_worker 一致:推理后对每个 item 单独放入 store_queue,由 store 线程批处理再 commit。 + """ torch.cuda.set_device(self.device_id) while True: try: @@ -190,6 +197,7 @@ def _infer_worker(self): self._save_to_cpu_cache(all_embeds=all_embeds, audios=audios) + # 与 visual _store_to_cpu_cache 相同条入队,便于 store 侧按 infer_max_batch_size 聚合 for audio in audios: self.store_queue.put(audio) @@ -208,6 +216,7 @@ def _save_to_cpu_cache(self, all_embeds: List[torch.Tensor], audios: List[AudioI return def _commit_to_cpu_cache(self, audios: List[AudioItem]): + # 与 visual _commit_to_cpu_cache:仅 tp0 通知完成;embed 已在 model.encode 内写入 cache if self.tp_rank_id == 0: for audio in audios: audio.cuda_event.synchronize() @@ -221,6 +230,9 @@ def _commit_to_cpu_cache(self, audios: List[AudioItem]): self._log_latency(audios[0], "set_items_embed") def _store_worker(self): + """ + 与 visual _store_worker 一致:从 store 队列按批取 AudioItem,再 commit 并释放信号量。 + """ while True: try: audios: List[AudioItem] = self._get_audio_items_from_store_queue(max_num=self.infer_max_batch_size) diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py index a920a09710..ac9bd9e180 100644 --- a/lightllm/server/core/objs/start_args_type.py +++ b/lightllm/server/core/objs/start_args_type.py @@ -107,12 +107,10 @@ class StartArgs: push_interval: int = field(default=10) visual_node_id: int = field(default=None) visual_infer_batch_size: int = field(default=None) - audio_infer_batch_size: int = field(default=None) visual_send_batch_size: int = field(default=1) visual_gpu_ids: List[int] = field(default_factory=lambda: [0]) visual_tp: int = field(default=1) visual_dp: int = field(default=1) - audio_dp: int = field(default=1) visual_nccl_ports: List[int] = field(default=None) visual_rpyc_port: Optional[int] = field(default=None) audio_gpu_ids: Optional[List[int]] = field(default=None) From 70aad721087731a2253a7b88a631a9994b53f3c5 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Fri, 3 Apr 2026 06:36:43 +0000 Subject: [PATCH 21/51] fix --- .../qwen3next/triton_kernel/causal_conv1d.py | 20 +++++++++++++++++-- lightllm/server/api_cli.py | 4 ---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py index c6d099a2d8..3371aca71a 100644 --- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py +++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py @@ -4,8 +4,20 @@ import torch -from sgl_kernel import causal_conv1d_fwd -from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel +from lightllm.utils.log_utils import init_logger + +logger = init_logger(__name__) + +try: + from sgl_kernel import causal_conv1d_fwd + from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel +except ImportError: + causal_conv1d_fwd = None + causal_conv1d_update_kernel = None + logger.warning( + "sgl_kernel is not installed, qwen3next/qwen3.5 causal_conv1d kernels are unavailable. " + "Install `sgl_kernel` before serving those models." + ) def causal_conv1d_fn( @@ -51,6 +63,8 @@ def causal_conv1d_fn( """ if activation not in [None, "silu", "swish"]: raise NotImplementedError("activation must be None, silu, or swish") + if causal_conv1d_fwd is None: + raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_fn") if x.stride(-1) != 1: x = x.contiguous() bias = bias.contiguous() if bias is not None else None @@ -103,6 +117,8 @@ def causal_conv1d_update( """ if activation not in [None, "silu", "swish"]: raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}") + if causal_conv1d_update_kernel is None: + raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_update") activation_val = activation in ["silu", "swish"] unsqueeze = x.dim() == 2 if unsqueeze: diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 01bf4d306b..89aeeec833 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -460,9 +460,6 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch" ) - parser.add_argument( - "--audio_infer_batch_size", type=int, default=None, help="number of audios to process in each inference batch" - ) parser.add_argument( "--visual_send_batch_size", type=int, @@ -477,7 +474,6 @@ def make_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument("--visual_tp", type=int, default=1, help="number of tensort parallel instances for ViT") parser.add_argument("--visual_dp", type=int, default=1, help="number of data parallel instances for ViT") - parser.add_argument("--audio_dp", type=int, default=1, help="number of data parallel instances for audio encoder") parser.add_argument( "--visual_nccl_ports", nargs="+", From 86a16f708d42395e8692022ae28a8805fbcb1b27 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 02:50:53 +0000 Subject: [PATCH 22/51] fix md5 and --- lightllm/server/httpserver/manager.py | 79 +++++++++++---------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index c9eb4de543..e9843c8237 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -128,41 +128,9 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str cost_ms = (time.time() - start_time) * 1000.0 extras = " ".join(f"{k}:{v}" for k, v in kwargs.items()) suffix = f" {extras}" if extras else "" - logger.info(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") + logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") return - def _prepare_multimodal_resource_inputs( - self, multimodal_params: MultimodalParams, sampling_params: SamplingParams - ) -> Tuple[List[Union[ImageItem, AudioItem]], List[str], List[int], List[bytes]]: - items, md5sums, tokens_nums, datas = [], [], [], [] - - for img in multimodal_params.images: - self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) - data = img.read() - token_num = self.tokenizer.get_image_token_length(img) - md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) - img.md5 = md5sum - md5sums.append(md5sum) - tokens_nums.append(token_num) - datas.append(data) - items.append(img) - - for audio in multimodal_params.audios: - self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) - data = audio.read() - token_num = self.tokenizer.get_audio_token_length(audio) - payload_md5 = audio.extra_params.get("audio_payload_md5") - if payload_md5 is None: - payload_md5 = hashlib.md5(data).hexdigest() - md5sum = payload_md5 + "_" + str(hash(frozendict(audio.extra_params))) - audio.md5 = md5sum - md5sums.append(md5sum) - tokens_nums.append(token_num) - datas.append(data) - items.append(audio) - - return items, md5sums, tokens_nums, datas - async def _alloc_resource(self, items, md5sums, token_nums, datas): while True: records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) @@ -197,6 +165,29 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs( multimodal_params, sampling_params ) + items, md5sums, tokens_nums, datas = [], [], [], [] + for img in multimodal_params.images: + self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) + data = img.read() + token_num = self.tokenizer.get_image_token_length(img) + md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) + img.md5 = md5sum + md5sums.append(md5sum) + tokens_nums.append(token_num) + datas.append(data) + items.append(img) + for audio in multimodal_params.audios: + self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) + data = audio.read() + token_num = self.tokenizer.get_audio_token_length(audio) + payload_md5 = audio.extra_params.get("audio_payload_md5") + md5sum = payload_md5 + audio.md5 = md5sum + md5sums.append(md5sum) + tokens_nums.append(token_num) + datas.append(data) + items.append(audio) + if len(items) <= 1: await self._alloc_resource(items, md5sums, tokens_nums, datas) return @@ -515,21 +506,13 @@ async def _encode( assert not self.args.disable_audio, "audio multimodal not enabled" await self._alloc_multimodal_resources(multimodal_params, sampling_params) log_req_id = getattr(sampling_params, "group_request_id", None) - if start_time is None: - logger.info( - f"lightllm_req_id:{log_req_id} " - f"stage:alloc_multimodal_resources_done " - f"elapsed_ms:0.000 " - f"audio_count:{len(multimodal_params.audios)} image_count:{len(multimodal_params.images)}" - ) - else: - self._log_stage_timing( - log_req_id, - start_time, - "alloc_multimodal_resources_done", - audio_count=len(multimodal_params.audios), - image_count=len(multimodal_params.images), - ) + self._log_stage_timing( + log_req_id, + start_time, + "alloc_multimodal_resources_done", + audio_count=len(multimodal_params.audios), + image_count=len(multimodal_params.images), + ) prompt_ids = self.tokenizer.encode( prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens ) From 46016378357ff382ecc492405bd9c3cdfc4ee6c9 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 02:51:43 +0000 Subject: [PATCH 23/51] fix md5 --- lightllm/server/multimodal_params.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index ce20e5d657..0aac1874c8 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -13,6 +13,8 @@ from fastapi import Request from lightllm.utils.multimodal_utils import fetch_resource from lightllm.utils.log_utils import init_logger +from frozendict import frozendict + logger = init_logger(__name__) RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes" @@ -118,7 +120,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None): self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length) self._preload_data = audio_values.tobytes() - self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params))) logger.info( f"lightllm_req_id:{req_id} stage:audio_preload_done " f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} " From 16203e4510f89d23acca2d81d4862975eed82d4c Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 03:00:13 +0000 Subject: [PATCH 24/51] format --- lightllm/server/multimodal_params.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 0aac1874c8..e62e73fade 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -120,7 +120,9 @@ async def preload(self, request: Request, audio_preload_config: dict = None): self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length) self._preload_data = audio_values.tobytes() - self.extra_params["audio_payload_md5"] = hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params))) + self.extra_params["audio_payload_md5"] = ( + hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params))) + ) logger.info( f"lightllm_req_id:{req_id} stage:audio_preload_done " f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} " From 93421d28662a45bc8ac86e658561a33a612612ef Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 03:13:51 +0000 Subject: [PATCH 25/51] using asyncio.to_thread preventing the server from handling other concurrent requests --- lightllm/server/multimodal_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index e62e73fade..5847975878 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -94,7 +94,7 @@ async def preload(self, request: Request, audio_preload_config: dict = None): # check if valid audio bytes decode_start = time.time() - audio_values, _ = librosa.load(BytesIO(audio_data), sr=target_sample_rate) + audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=target_sample_rate) audio_values = np.asarray(audio_values, dtype=np.float32) decode_cost_ms = (time.time() - decode_start) * 1000.0 effective_audio_len = max(audio_values.shape[0], min_audio_len) From f7b05898d0948404d685ab5094ed4c1aab2bd27e Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 06:06:25 +0000 Subject: [PATCH 26/51] fix --- lightllm/server/httpserver/manager.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index e9843c8237..0e4a9b79eb 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -162,9 +162,6 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): - items, md5sums, tokens_nums, datas = self._prepare_multimodal_resource_inputs( - multimodal_params, sampling_params - ) items, md5sums, tokens_nums, datas = [], [], [], [] for img in multimodal_params.images: self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) From 0ea215605bec31a05c33b8b9b6ea1832a8ac6464 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Tue, 7 Apr 2026 08:52:11 +0000 Subject: [PATCH 27/51] fix --- lightllm/utils/multimodal_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lightllm/utils/multimodal_utils.py b/lightllm/utils/multimodal_utils.py index 5f4fd18516..4b49ea8891 100644 --- a/lightllm/utils/multimodal_utils.py +++ b/lightllm/utils/multimodal_utils.py @@ -4,14 +4,11 @@ import httpx from PIL import Image from io import BytesIO -from urllib.parse import urlparse -from typing import Dict, Optional from fastapi import Request +from functools import lru_cache from lightllm.utils.log_utils import init_logger logger = init_logger(__name__) -_HTTP_CLIENTS: Dict[Optional[str], httpx.AsyncClient] = {} -_LOCAL_RESOURCE_HOSTS = {"127.0.0.1", "localhost", "::1"} def _httpx_async_client_proxy_kwargs(proxy) -> dict: @@ -39,15 +36,17 @@ def image2base64(img_str: str): return base64.b64encode(buffer.getvalue()).decode("utf-8") +@lru_cache(maxsize=256) +def _get_xhttp_client(proxy=None): + kvargs = _httpx_async_client_proxy_kwargs(proxy) + kvargs["limits"] = httpx.Limits(max_connections=10000, max_keepalive_connections=20) + return httpx.AsyncClient(**kvargs) + + async def fetch_resource(url, request: Request, timeout, proxy=None): logger.info(f"Begin to download resource from url: {url}") start_time = time.time() - hostname = urlparse(url).hostname - effective_proxy = None if hostname in _LOCAL_RESOURCE_HOSTS else proxy - client = _HTTP_CLIENTS.get(effective_proxy) - if client is None: - client = httpx.AsyncClient(**_httpx_async_client_proxy_kwargs(effective_proxy)) - _HTTP_CLIENTS[effective_proxy] = client + client = _get_xhttp_client(proxy) async with client.stream("GET", url, timeout=timeout) as response: response.raise_for_status() ans_bytes = [] From 6856540018aff2b4614b64e1af88374da721ac84 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 13:20:21 +0000 Subject: [PATCH 28/51] fix --- .../qwen3next/triton_kernel/causal_conv1d.py | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py index 3371aca71a..2bf325340f 100644 --- a/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py +++ b/lightllm/models/qwen3next/triton_kernel/causal_conv1d.py @@ -4,21 +4,6 @@ import torch -from lightllm.utils.log_utils import init_logger - -logger = init_logger(__name__) - -try: - from sgl_kernel import causal_conv1d_fwd - from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel -except ImportError: - causal_conv1d_fwd = None - causal_conv1d_update_kernel = None - logger.warning( - "sgl_kernel is not installed, qwen3next/qwen3.5 causal_conv1d kernels are unavailable. " - "Install `sgl_kernel` before serving those models." - ) - def causal_conv1d_fn( x: torch.Tensor, @@ -63,8 +48,8 @@ def causal_conv1d_fn( """ if activation not in [None, "silu", "swish"]: raise NotImplementedError("activation must be None, silu, or swish") - if causal_conv1d_fwd is None: - raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_fn") + from sgl_kernel import causal_conv1d_fwd + if x.stride(-1) != 1: x = x.contiguous() bias = bias.contiguous() if bias is not None else None @@ -117,8 +102,8 @@ def causal_conv1d_update( """ if activation not in [None, "silu", "swish"]: raise NotImplementedError(f"activation must be None, silu, or swish, actual: {activation}") - if causal_conv1d_update_kernel is None: - raise ImportError("sgl_kernel is required for qwen3next/qwen3.5 causal_conv1d_update") + from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel + activation_val = activation in ["silu", "swish"] unsqueeze = x.dim() == 2 if unsqueeze: From 9d0671b7ba3b01c995f8a4e4fefef7fb94d80f8d Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 13:26:37 +0000 Subject: [PATCH 29/51] use details_log to log --- lightllm/server/httpserver/manager.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 0e4a9b79eb..e2a0dbc4b6 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -125,10 +125,11 @@ def __init__( return def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str, **kwargs): - cost_ms = (time.time() - start_time) * 1000.0 - extras = " ".join(f"{k}:{v}" for k, v in kwargs.items()) - suffix = f" {extras}" if extras else "" - logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") + if self.args.detail_log: + cost_ms = (time.time() - start_time) * 1000.0 + extras = " ".join(f"{k}:{v}" for k, v in kwargs.items()) + suffix = f" {extras}" if extras else "" + logger.debug(f"lightllm_req_id:{group_request_id} stage:{stage} elapsed_ms:{cost_ms:.3f}{suffix}") return async def _alloc_resource(self, items, md5sums, token_nums, datas): From 8e21207325fd8205cedebf6c9f30efa60a152bbb Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 13:40:33 +0000 Subject: [PATCH 30/51] delete warmup --- .../qwen3_omni_audio.py | 24 --------------- lightllm/models/whisper/whisper_audio.py | 20 ------------- lightllm/server/api_http.py | 7 +---- .../audioserver/model_infer/model_rpc.py | 30 ------------------- lightllm/server/multimodal_params.py | 22 -------------- 5 files changed, 1 insertion(+), 102 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index c14df5ff9d..7d525915af 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -2,9 +2,7 @@ import json import math import torch -import librosa import numpy as np -from io import BytesIO from torch import Tensor, nn from safetensors import safe_open from torch.nn import functional as F @@ -16,10 +14,6 @@ from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor -from lightllm.utils.log_utils import init_logger - - -logger = init_logger(__name__) def _get_feat_extract_output_lengths(input_lengths): @@ -376,21 +370,3 @@ def encode(self, audio_items: List[AudioItem]): all_embeds.append(cur_embed) return all_embeds, audio_items - - @torch.no_grad() - def warmup(self, audio_bytes: bytes): - audio = BytesIO(audio_bytes) - audio, _ = librosa.load(audio, sr=self.processor.sampling_rate) - num_frames = max(audio.shape[0], 480) // self.processor.hop_length - padded_len = ((max(audio.shape[0], 480) + self.processor.hop_length - 1) // self.processor.hop_length) * ( - self.processor.hop_length - ) - if padded_len > audio.shape[0]: - audio = np.pad(audio, (0, padded_len - audio.shape[0]), mode="constant", constant_values=0.0) - input_features, feature_lens = self.processor._preprocess_single_padded(audio, num_frames, device="cpu") - _ = self.forward( - input_features, - feature_lens=feature_lens, - ) - torch.cuda.current_stream().synchronize() - return diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index 750bf8e158..4cd9619e55 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -1,10 +1,8 @@ import os import json -import librosa import numpy as np import torch import torch.nn.functional as F -from io import BytesIO from typing import List, Union from safetensors.torch import load_file from transformers.processing_utils import ProcessorMixin @@ -225,21 +223,3 @@ def encode(self, audio_items: List[AudioItem]): ans_embeds.append(cur_embed) return ans_embeds, audio_items - - @torch.no_grad() - def warmup(self, audio_bytes: bytes): - audio = BytesIO(audio_bytes) - audio, _ = librosa.load(audio, sr=16000) - - from .defaults import MIN_AUDIO_LEN - - if audio.shape[0] < MIN_AUDIO_LEN: - audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0) - - batch_audio_lens = np.array([min(audio.shape[0], self.max_length)], dtype=np.int32) - audios, audio_lens_after_cnn = self.audio_processor( - [audio], batch_audio_lens, sampling_rate=16000, return_tensors="pt" - ) - _ = self.forward(audios, audio_lens_after_cnn) - torch.cuda.current_stream().synchronize() - return diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py index 1322168e38..40d20bcd27 100755 --- a/lightllm/server/api_http.py +++ b/lightllm/server/api_http.py @@ -41,7 +41,7 @@ from fastapi.responses import Response, StreamingResponse, JSONResponse from lightllm.server.core.objs.sampling_params import SamplingParams from lightllm.server.core.objs import StartArgs -from .multimodal_params import MultimodalParams, warmup_audio_preload +from .multimodal_params import MultimodalParams from .httpserver.manager import HttpServerManager from .httpserver_for_pd_master.manager import HttpServerManagerForPDMaster from .api_lightllm import lightllm_get_score @@ -389,11 +389,6 @@ async def startup_event(): logger.info("server start up") loop = asyncio.get_event_loop() g_objs.set_args(get_env_start_args()) - if g_objs.args.enable_multimodal and not g_objs.args.disable_audio: - warmup_start = time.time() - logger.info("http_audio_preload_warmup_start") - await warmup_audio_preload() - logger.info(f"http_audio_preload_warmup_done elapsed_ms:{(time.time() - warmup_start) * 1000.0:.3f}") loop.create_task(g_objs.httpserver_manager.handle_loop()) logger.info(f"server start up ok, loop use is {asyncio.get_event_loop()}") return diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py index 343816e1fd..8a04231508 100644 --- a/lightllm/server/audioserver/model_infer/model_rpc.py +++ b/lightllm/server/audioserver/model_infer/model_rpc.py @@ -1,10 +1,6 @@ -import io import queue import threading import time -import wave - -import numpy as np import rpyc import socket import torch @@ -25,17 +21,6 @@ logger = init_logger(__name__) -def _generate_silence_wav_bytes(sample_rate: int = 16000, num_samples: int = 16000) -> bytes: - samples = np.zeros(num_samples, dtype=np.int16) - buffer = io.BytesIO() - with wave.open(buffer, "wb") as wav_file: - wav_file.setnchannels(1) - wav_file.setsampwidth(2) - wav_file.setframerate(sample_rate) - wav_file.writeframes(samples.tobytes()) - return buffer.getvalue() - - class AudioModelRpcServer(rpyc.Service): def exposed_init_model(self, kvargs): kvargs = obtain(kvargs) @@ -74,7 +59,6 @@ def exposed_init_model(self, kvargs): create_meta_data=False, init_shm_data=False, ) - self._auto_warmup_model() self._init_taskes() except Exception as e: print("#" * 16) @@ -87,20 +71,6 @@ def exposed_init_model(self, kvargs): set_random_seed(2147483647) return - def _auto_warmup_model(self): - if not hasattr(self.model, "warmup"): - return - try: - torch.cuda.set_device(self.device_id) - warmup_audio = _generate_silence_wav_bytes() - self.model.warmup(warmup_audio) - logger.info( - f"audio model auto warmup finished on dp_rank_id:{self.dp_rank_id} tp_rank_id:{self.tp_rank_id}" - ) - except Exception as e: - logger.exception(f"audio model auto warmup failed: {e}") - raise - def exposed_run_task(self, audios: List[AudioItem], ref_event_list: List[threading.Event]): try: audios = obtain(audios) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 5847975878..79ef2fe028 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -1,7 +1,6 @@ """Multimodal parameters for text generation.""" import asyncio import os -import wave import time import librosa import base64 @@ -25,17 +24,6 @@ DEFAULT_MIN_AUDIO_LEN = 480 -def generate_silence_wav_bytes(sample_rate: int = 16000, duration_seconds: float = 1.0) -> bytes: - num_samples = max(1, int(sample_rate * duration_seconds)) - with BytesIO() as buffer: - with wave.open(buffer, "wb") as wav_file: - wav_file.setnchannels(1) - wav_file.setsampwidth(2) - wav_file.setframerate(sample_rate) - wav_file.writeframes(b"\x00\x00" * num_samples) - return buffer.getvalue() - - def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray: audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT) if audio_shm_format == WAVEFORM_F32_SHM_FORMAT: @@ -273,13 +261,3 @@ def to_origin_dict(self): ret["images"] = [i.to_origin_dict() for i in self.images] ret["audios"] = [a.to_origin_dict() for a in self.audios] return ret - - -async def warmup_audio_preload(): - warmup_audio = AudioItem( - type="base64", - data=base64.b64encode(generate_silence_wav_bytes()).decode("utf-8"), - ) - await warmup_audio.preload(None) - warmup_audio.read() - return From fe39faa1b994802083fa7acd3539a5371eebcbad Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 14:21:01 +0000 Subject: [PATCH 31/51] delete audio_preload_config --- .../common/basemodel/multimodal_tokenizer.py | 1 - lightllm/models/internvl/model.py | 5 -- .../models/qwen3_omni_moe_thinker/model.py | 5 -- .../qwen3_omni_audio.py | 26 +++---- lightllm/server/api_http.py | 4 +- .../audioserver/model_infer/model_rpc.py | 1 - lightllm/server/httpserver/manager.py | 4 +- lightllm/server/multimodal_params.py | 76 +++---------------- 8 files changed, 24 insertions(+), 98 deletions(-) diff --git a/lightllm/common/basemodel/multimodal_tokenizer.py b/lightllm/common/basemodel/multimodal_tokenizer.py index 872a418bf7..cdcbd7f089 100644 --- a/lightllm/common/basemodel/multimodal_tokenizer.py +++ b/lightllm/common/basemodel/multimodal_tokenizer.py @@ -33,7 +33,6 @@ class BaseMultiModalTokenizer(ABC): def __init__(self, tokenizer, **kwargs): self.tokenizer = tokenizer - self.audio_preload_config = None def __getattr__(self, name): obj_dict = object.__getattribute__(self, "__dict__") diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py index 70c797aeb8..ccb76d3512 100644 --- a/lightllm/models/internvl/model.py +++ b/lightllm/models/internvl/model.py @@ -50,11 +50,6 @@ def __init__(self, tokenizer, model_cfg, **kwargs): self.audio_min_length = MIN_AUDIO_LEN self.audio_max_length = 16000 * 30 - self.audio_preload_config = { - "sampling_rate": 16000, - "hop_length": 160, - "min_audio_len": int(self.audio_min_length), - } def init_imageitem_extral_params( self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py index 6ae73fd1d1..79ce939714 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/model.py +++ b/lightllm/models/qwen3_omni_moe_thinker/model.py @@ -45,11 +45,6 @@ def __init__(self, tokenizer=None, processor=None, **kwargs): self.sampling_rate = self.audio_processor.sampling_rate self.n_samples = self.audio_processor.n_samples self.hop_length = self.audio_processor.hop_length - self.audio_preload_config = { - "sampling_rate": int(self.sampling_rate), - "hop_length": int(self.hop_length), - "min_audio_len": int(MIN_AUDIO_LEN), - } self.image_start_id = kwargs["model_cfg"]["vision_start_token_id"] self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"] diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 7d525915af..71fdb3f3b1 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -9,7 +9,7 @@ from typing import Callable, Optional, Union, List from transformers.activations import ACT2FN -from lightllm.server.multimodal_params import AudioItem, WAVEFORM_F32_SHM_FORMAT, load_audio_from_shm_payload +from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd @@ -338,25 +338,19 @@ def encode(self, audio_items: List[AudioItem]): items.append(item) audio_data = read_shm(get_shm_name_data(item.uuid)) audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate) - audio_num_frames = item.extra_params.get("audio_num_frames") else: raise ValueError(f"cannot read audio which type is {type(item)}!") - if audio_num_frames is not None and item.extra_params.get("audio_shm_format") == WAVEFORM_F32_SHM_FORMAT: - input_features, feature_lens = self.processor._preprocess_single_padded( - audio, int(audio_num_frames), device="cpu" - ) + input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) + if feature_attention_mask is not None: + audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) + input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0) else: - input_features, feature_attention_mask = self.processor._preprocess(audio, return_attention_mask=True) - if feature_attention_mask is not None: - audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) - input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()].permute(1, 0) - else: - audio_feature_lengths = None - - feature_lens = ( - audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) - ) + audio_feature_lengths = None + + feature_lens = ( + audio_feature_lengths if audio_feature_lengths is not None else feature_attention_mask.sum(-1) + ) audio_features = self.forward( input_features, diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py index 40d20bcd27..50d992bf9c 100755 --- a/lightllm/server/api_http.py +++ b/lightllm/server/api_http.py @@ -300,9 +300,7 @@ async def tokens(request: Request): multimodal_params_dict = request_dict.get("multimodal_params", {}) multimodal_params = MultimodalParams(**multimodal_params_dict) - await multimodal_params.verify_and_preload( - request, audio_preload_config=getattr(g_objs.httpserver_manager.tokenizer, "audio_preload_config", None) - ) + await multimodal_params.verify_and_preload(request) return JSONResponse( { "ntokens": g_objs.httpserver_manager.tokens( diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py index 8a04231508..39a7e06ac3 100644 --- a/lightllm/server/audioserver/model_infer/model_rpc.py +++ b/lightllm/server/audioserver/model_infer/model_rpc.py @@ -8,7 +8,6 @@ from typing import List from transformers.configuration_utils import PretrainedConfig from rpyc.utils.classic import obtain - from lightllm.models.whisper.whisper_audio import WhisperAudioModel from lightllm.models.qwen3_omni_moe_thinker.qwen3_omni_audio import Qwen3OmniMoeAudioEncoder from lightllm.server.multimodal_params import AudioItem diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index e2a0dbc4b6..acfe04850f 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -311,9 +311,7 @@ async def generate( original_multimodal_params = copy.deepcopy(multimodal_params) if self.pd_mode.is_P_or_NORMAL(): - await multimodal_params.verify_and_preload( - request, audio_preload_config=getattr(self.tokenizer, "audio_preload_config", None) - ) + await multimodal_params.verify_and_preload(request) self._log_stage_timing( group_request_id, start_time, diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 79ef2fe028..f103e54ce5 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -16,28 +16,14 @@ logger = init_logger(__name__) -RAW_AUDIO_SHM_FORMAT = "raw_audio_bytes" -WAVEFORM_F32_SHM_FORMAT = "waveform_f32" -AUDIO_SHM_USE_RAW_ENV = "LIGHTLLM_AUDIO_SHM_USE_RAW" DEFAULT_AUDIO_SAMPLE_RATE = 16000 -DEFAULT_AUDIO_HOP_LENGTH = 160 -DEFAULT_MIN_AUDIO_LEN = 480 def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray: - audio_shm_format = extra_params.get("audio_shm_format", RAW_AUDIO_SHM_FORMAT) - if audio_shm_format == WAVEFORM_F32_SHM_FORMAT: - num_samples = int(extra_params.get("audio_num_samples", 0)) - if num_samples > 0: - return np.frombuffer(audio_data, dtype=np.float32, count=num_samples) - return np.frombuffer(audio_data, dtype=np.float32) - - audio, _ = librosa.load(BytesIO(audio_data), sr=sample_rate) - return np.asarray(audio, dtype=np.float32) - - -def should_use_raw_audio_shm() -> bool: - return os.getenv(AUDIO_SHM_USE_RAW_ENV, "0") == "1" + num_samples = int(extra_params.get("audio_num_samples", 0)) + if num_samples > 0: + return np.frombuffer(audio_data, dtype=np.float32, count=num_samples) + return np.frombuffer(audio_data, dtype=np.float32) class AudioItem: @@ -60,11 +46,8 @@ def __init__(self, **kwargs): self._preload_data = None self.extra_params = {} - async def preload(self, request: Request, audio_preload_config: dict = None): + async def preload(self, request: Request): try: - req_id = getattr(getattr(request, "state", None), "lightllm_req_id", None) - preload_start = time.time() - source_ready_start = preload_start if self._type == "url": timeout = int(os.getenv("REQUEST_TIMEOUT", "5")) proxy = os.getenv("REQUEST_PROXY", None) @@ -73,51 +56,18 @@ async def preload(self, request: Request, audio_preload_config: dict = None): audio_data = base64.b64decode(self._data) else: raise ValueError(f"cannot read audio which type is {self._type}!") - source_ready_cost_ms = (time.time() - source_ready_start) * 1000.0 - - audio_preload_config = audio_preload_config or {} - target_sample_rate = int(audio_preload_config.get("sampling_rate", DEFAULT_AUDIO_SAMPLE_RATE)) - hop_length = int(audio_preload_config.get("hop_length", DEFAULT_AUDIO_HOP_LENGTH)) - min_audio_len = int(audio_preload_config.get("min_audio_len", DEFAULT_MIN_AUDIO_LEN)) # check if valid audio bytes - decode_start = time.time() - audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=target_sample_rate) + audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=DEFAULT_AUDIO_SAMPLE_RATE) audio_values = np.asarray(audio_values, dtype=np.float32) - decode_cost_ms = (time.time() - decode_start) * 1000.0 - effective_audio_len = max(audio_values.shape[0], min_audio_len) - padded_audio_len = ((effective_audio_len + hop_length - 1) // hop_length) * hop_length - if padded_audio_len > audio_values.shape[0]: - audio_values = np.pad( - audio_values, - (0, padded_audio_len - audio_values.shape[0]), - mode="constant", - constant_values=0.0, - ) + from lightllm.models.whisper.defaults import MIN_AUDIO_LEN - self.audio_length = effective_audio_len - if should_use_raw_audio_shm(): - self.extra_params["audio_shm_format"] = RAW_AUDIO_SHM_FORMAT - self.extra_params.pop("audio_sample_rate", None) - self.extra_params.pop("audio_num_samples", None) - self.extra_params.pop("audio_num_frames", None) - self._preload_data = audio_data - else: - self.extra_params["audio_shm_format"] = WAVEFORM_F32_SHM_FORMAT - self.extra_params["audio_sample_rate"] = target_sample_rate - self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) - self.extra_params["audio_num_frames"] = int(effective_audio_len // hop_length) - self._preload_data = audio_values.tobytes() + self.audio_length = max(int(audio_values.shape[0]), MIN_AUDIO_LEN) + self._preload_data = audio_values.tobytes() + self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) self.extra_params["audio_payload_md5"] = ( hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params))) ) - logger.info( - f"lightllm_req_id:{req_id} stage:audio_preload_done " - f"elapsed_ms:{(time.time() - preload_start) * 1000.0:.3f} " - f"source_type:{self._type} source_ready_ms:{source_ready_cost_ms:.3f} " - f"decode_ms:{decode_cost_ms:.3f} audio_length:{self.audio_length} " - f"shm_format:{self.extra_params['audio_shm_format']}" - ) return except Exception as e: @@ -238,11 +188,9 @@ def __init__( self.audios = [AudioItem(**a) for a in audios] return - async def verify_and_preload(self, request: Request, audio_preload_config: dict = None): + async def verify_and_preload(self, request: Request): preload_coroutines = [image.preload(request) for image in self.images] - preload_coroutines.extend( - audio.preload(request, audio_preload_config=audio_preload_config) for audio in self.audios - ) + preload_coroutines.extend(audio.preload(request) for audio in self.audios) if preload_coroutines: await asyncio.gather(*preload_coroutines) return From f1c9f0770a5e8452fbffe62c694f0ccfdbbf7d4c Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Tue, 7 Apr 2026 14:24:16 +0000 Subject: [PATCH 32/51] delete _preprocess_single_padded --- .../qwen3_omni_moe_thinker/audio_process.py | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py index 42eae8edb5..e9dc931886 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py +++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py @@ -105,29 +105,6 @@ def zero_mean_unit_var_norm( return normed_input_values - def _preprocess_single_padded( - self, - raw_speech: np.ndarray, - num_frames: int, - device: Optional[str] = "cpu", - ) -> Tuple[torch.Tensor, torch.Tensor]: - waveform = np.asarray(raw_speech, dtype=np.float32) - if waveform.ndim != 1: - raise ValueError(f"single audio fast path expects 1D waveform, got shape={waveform.shape}") - - extracted = self._torch_extract_fbank_features(waveform[None, :], device) - extracted = np.asarray(extracted, dtype=np.float32) - if extracted.ndim != 3: - raise ValueError(f"unexpected extracted feature shape={extracted.shape}") - - if extracted.shape[-1] < num_frames: - raise ValueError(f"feature frames {extracted.shape[-1]} < requested num_frames {num_frames}") - - compact_features = torch.from_numpy(extracted[:, :, :num_frames]).to(device="cuda", dtype=torch.bfloat16) - compact_features = compact_features[0].contiguous() - feature_lens = torch.tensor([num_frames], device="cuda", dtype=torch.long) - return compact_features, feature_lens - def _preprocess( self, raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], From 9bee105b4a7e89e27ac11f783872793dcb643ed8 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 01:58:11 +0000 Subject: [PATCH 33/51] fix --- lightllm/server/multimodal_params.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index f103e54ce5..2e8ed701e4 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -189,10 +189,11 @@ def __init__( return async def verify_and_preload(self, request: Request): - preload_coroutines = [image.preload(request) for image in self.images] - preload_coroutines.extend(audio.preload(request) for audio in self.audios) - if preload_coroutines: - await asyncio.gather(*preload_coroutines) + tasks = [image.preload(request) for image in self.images] + tasks += [audio.preload(request) for audio in self.audios] + + if tasks: + await asyncio.gather(*tasks) return def to_dict(self): From 6c9c49067cd6d1480685ae2636637b9aefe56cd2 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 02:51:45 +0000 Subject: [PATCH 34/51] fix --- lightllm/server/httpserver/manager.py | 82 +++++++++++++++------------ 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index acfe04850f..d5dcd37825 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -133,33 +133,48 @@ def _log_stage_timing(self, group_request_id: int, start_time: float, stage: str return async def _alloc_resource(self, items, md5sums, token_nums, datas): - while True: - records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) - - if records is None: - await asyncio.sleep(0.1) - continue - - if isinstance(records, str) and "error" in records: - logger.error(str(records) + "and try to set --embed_cache_storage_size bigger") - raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger") - - update_data_ids = [] - for item, rec, data in zip(items, records, datas): - item: Union[ImageItem, AudioItem] = item - item.uuid = rec["id"] - item.token_id = rec["token_id"] - item.token_num = rec["token_num"] - item.start_index_in_embed_cache = rec["start_index_in_embed_cache"] - - if not rec["data_ready"]: - create_shm(get_shm_name_data(rec["id"]), data) - update_data_ids.append(rec["id"]) - - if update_data_ids: - self.cache_client.root.set_items_data(update_data_ids) + if len(items) == 0: return + for _ in range(1000): + # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。 + # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10, + # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。 + async with self._resource_lock: + records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) + if records is not None: + break + await asyncio.sleep(0.01) + + # 长时间无法申请到足够资源的时候,则开始进行阻塞式尝试,防止其他请求一起申请相关资源。 + if records is None: + async with self._resource_lock: + while records is None: + records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) + if records is not None: + break + await asyncio.sleep(0.1) + + if isinstance(records, str) and "error" in records: + logger.error(str(records) + "and try to set --embed_cache_storage_size bigger") + raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger") + + update_data_ids = [] + for item, rec, data in zip(items, records, datas): + item: Union[ImageItem, AudioItem] = item + item.uuid = rec["id"] + item.token_id = rec["token_id"] + item.token_num = rec["token_num"] + item.start_index_in_embed_cache = rec["start_index_in_embed_cache"] + + if not rec["data_ready"]: + create_shm(get_shm_name_data(rec["id"]), data) + update_data_ids.append(rec["id"]) + + if update_data_ids: + self.cache_client.root.set_items_data(update_data_ids) + return + async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams): # 只有 P 和 NORMAL 节点需要真的管理多模态资源 if self.pd_mode.is_P_or_NORMAL(): @@ -167,10 +182,11 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, for img in multimodal_params.images: self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params) data = img.read() + # must after init_imageitem_extral_params token_num = self.tokenizer.get_image_token_length(img) md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params))) - img.md5 = md5sum md5sums.append(md5sum) + img.md5 = md5sum tokens_nums.append(token_num) datas.append(data) items.append(img) @@ -178,22 +194,14 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, sampling_params) data = audio.read() token_num = self.tokenizer.get_audio_token_length(audio) - payload_md5 = audio.extra_params.get("audio_payload_md5") - md5sum = payload_md5 - audio.md5 = md5sum + md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(audio.extra_params))) md5sums.append(md5sum) + audio.md5 = md5sum tokens_nums.append(token_num) datas.append(data) items.append(audio) - if len(items) <= 1: - await self._alloc_resource(items, md5sums, tokens_nums, datas) - return - # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。 - # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10, - # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。 - async with self._resource_lock: - await self._alloc_resource(items, md5sums, tokens_nums, datas) + await self._alloc_resource(items, md5sums, tokens_nums, datas) return async def _release_multimodal_resources(self, multimodal_params: MultimodalParams): From 3b057d0b6c450f167c6f2534e75d74a1c5801f0c Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 02:55:00 +0000 Subject: [PATCH 35/51] fix --- lightllm/server/httpserver/manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index d5dcd37825..115be4bd38 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -136,7 +136,7 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): if len(items) == 0: return - for _ in range(1000): + for _ in range(2000): # 这里的锁是为了 防止多个含有多张图片的请求 同时申请的record数量 大于cache_capacity,从而造成死锁的问题。 # 如果不加任何锁,假如请求1和请求2都有6张图片,而cache_capacity为10, # 那么如果某一时刻shm中存在请求1的5张图和请求2的5张图,将会资源竞争产生死锁。 @@ -144,7 +144,7 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas): records = obtain(self.cache_client.root.alloc(md5sums, token_nums)) if records is not None: break - await asyncio.sleep(0.01) + await asyncio.sleep(0.005) # 长时间无法申请到足够资源的时候,则开始进行阻塞式尝试,防止其他请求一起申请相关资源。 if records is None: From a8a8130932a90e1a51c4f94665357ed6127005a3 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 03:03:35 +0000 Subject: [PATCH 36/51] fix --- lightllm/server/multimodal_params.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 2e8ed701e4..e45a28db12 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -1,10 +1,8 @@ """Multimodal parameters for text generation.""" import asyncio import os -import time import librosa import base64 -import hashlib import numpy as np from typing import List from io import BytesIO @@ -12,11 +10,9 @@ from fastapi import Request from lightllm.utils.multimodal_utils import fetch_resource from lightllm.utils.log_utils import init_logger -from frozendict import frozendict logger = init_logger(__name__) -DEFAULT_AUDIO_SAMPLE_RATE = 16000 def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray: @@ -58,16 +54,19 @@ async def preload(self, request: Request): raise ValueError(f"cannot read audio which type is {self._type}!") # check if valid audio bytes - audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=DEFAULT_AUDIO_SAMPLE_RATE) + audio_values, _ = await asyncio.to_thread(librosa.load, BytesIO(audio_data), sr=16000) audio_values = np.asarray(audio_values, dtype=np.float32) + from lightllm.models.whisper.defaults import MIN_AUDIO_LEN - self.audio_length = max(int(audio_values.shape[0]), MIN_AUDIO_LEN) + if audio_values.shape[0] < MIN_AUDIO_LEN: + audio_values = np.pad( + audio_values, (0, MIN_AUDIO_LEN - audio_values.shape[0]), mode="constant", constant_values=0.0 + ) + logger.warning(f"audio length is too short, pad to {MIN_AUDIO_LEN}") + + self.audio_length = int(audio_values.shape[0]) self._preload_data = audio_values.tobytes() - self.extra_params["audio_num_samples"] = int(audio_values.shape[0]) - self.extra_params["audio_payload_md5"] = ( - hashlib.md5(self._preload_data).hexdigest() + "_" + str(hash(frozendict(self.extra_params))) - ) return except Exception as e: From 4479a6599423cf3a442cb0a937ad89ab07dac8bc Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 03:23:01 +0000 Subject: [PATCH 37/51] fix --- .../qwen3_omni_moe_thinker/audio_process.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py index e9dc931886..194914d455 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py +++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py @@ -5,6 +5,7 @@ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor from transformers.feature_extraction_utils import BatchFeature from transformers.utils import TensorType +from functools import lru_cache class WhisperFeatureExtractor(SequenceFeatureExtractor): @@ -46,32 +47,25 @@ def __init__( norm="slaney", mel_scale="slaney", ) - self._hann_window_cache = {} - self._mel_filters_cache = {} - - def _get_cached_feature_tensors(self, device: Union[str, torch.device]): - device_key = str(device) - window = self._hann_window_cache.get(device_key) - if window is None: - window = torch.hann_window(self.n_fft, device=device) - self._hann_window_cache[device_key] = window - - mel_filters = self._mel_filters_cache.get(device_key) - if mel_filters is None: - mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32) - self._mel_filters_cache[device_key] = mel_filters - return window, mel_filters + + @lru_cache(maxsize=12) + def get_hann_window(self, device: Union[str, torch.device]): + return torch.hann_window(self.n_fft, device=device) + + @lru_cache(maxsize=12) + def get_mel_filters(self, device: Union[str, torch.device]): + return torch.from_numpy(self.mel_filters).to(device, torch.float32) def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray: waveform = torch.from_numpy(waveform).to(device, torch.float32) - window, mel_filters = self._get_cached_feature_tensors(device) + window = self.get_hann_window(device) if self.dither != 0.0: waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device) stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True) magnitudes = stft[..., :-1].abs() ** 2 - + mel_filters = self.get_mel_filters(device) mel_spec = mel_filters.T @ magnitudes log_spec = torch.clamp(mel_spec, min=1e-10).log10() From be595131895792f57532be06a5988923935fae20 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 05:03:39 +0000 Subject: [PATCH 38/51] fix --- lightllm/models/qwen3_omni_moe_thinker/model.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py index 79ce939714..1b8fa0110d 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/model.py +++ b/lightllm/models/qwen3_omni_moe_thinker/model.py @@ -1,7 +1,8 @@ import os import json import librosa -from collections import OrderedDict +import copy +from functools import lru_cache from io import BytesIO from lightllm.common.build_utils import repair_config from lightllm.models.registry import ModelRegistry @@ -31,8 +32,6 @@ class QWen3OmniTokenizer(QWen3VLTokenizer): def __init__(self, tokenizer=None, processor=None, **kwargs): self.tokenizer = tokenizer - self._prompt_encode_cache = OrderedDict() - self._prompt_encode_cache_capacity = 64 # image self.image_processor = processor.image_processor self.min_pixel = self.image_processor.min_pixels @@ -69,16 +68,9 @@ def get_audio_token_length(self, audio: AudioItem): # print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}") return token_num + @lru_cache(maxsize=128) def _encode_prompt_text(self, prompt: str): - cached_ids = self._prompt_encode_cache.get(prompt) - if cached_ids is not None: - self._prompt_encode_cache.move_to_end(prompt) - return list(cached_ids) - origin_ids = self.tokenizer.encode(prompt) - self._prompt_encode_cache[prompt] = tuple(origin_ids) - if len(self._prompt_encode_cache) > self._prompt_encode_cache_capacity: - self._prompt_encode_cache.popitem(last=False) return origin_ids def _caclu_audio_token_num(self, input_audio_len: int): @@ -90,6 +82,7 @@ def _caclu_audio_token_num(self, input_audio_len: int): def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs): origin_ids = self._encode_prompt_text(prompt) + origin_ids = copy.deepcopy(origin_ids) # -> origin_ids = [token for token in origin_ids if token not in (self.image_token_id, self.audio_token_id)] From 3b0e61353c5eb5017c57fa37c49910e868b8b39e Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 05:19:13 +0000 Subject: [PATCH 39/51] fix --- lightllm/server/multimodal_params.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index e45a28db12..6de86fd8b5 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -8,6 +8,7 @@ from io import BytesIO from PIL import Image from fastapi import Request +from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data from lightllm.utils.multimodal_utils import fetch_resource from lightllm.utils.log_utils import init_logger @@ -15,13 +16,6 @@ logger = init_logger(__name__) -def load_audio_from_shm_payload(audio_data: bytes, extra_params: dict, sample_rate: int) -> np.ndarray: - num_samples = int(extra_params.get("audio_num_samples", 0)) - if num_samples > 0: - return np.frombuffer(audio_data, dtype=np.float32, count=num_samples) - return np.frombuffer(audio_data, dtype=np.float32) - - class AudioItem: def __init__(self, **kwargs): self._type = kwargs["type"] @@ -97,6 +91,12 @@ def to_origin_dict(self): ret["data"] = self._data return ret + def load_audio_from_shm_payload(self) -> np.ndarray: + audio_data = read_shm(get_shm_name_data(self.uuid)) + audio_array = np.frombuffer(audio_data, dtype=np.float32) + assert audio_array.shape[0] == self.audio_length + return audio_array + class ImageItem: def __init__(self, **kwargs): From 56af31d4a1354ef29e434355471540da2a95dc5d Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 05:21:17 +0000 Subject: [PATCH 40/51] fix --- lightllm/server/multimodal_params.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py index 6de86fd8b5..6210628751 100644 --- a/lightllm/server/multimodal_params.py +++ b/lightllm/server/multimodal_params.py @@ -94,7 +94,9 @@ def to_origin_dict(self): def load_audio_from_shm_payload(self) -> np.ndarray: audio_data = read_shm(get_shm_name_data(self.uuid)) audio_array = np.frombuffer(audio_data, dtype=np.float32) - assert audio_array.shape[0] == self.audio_length + if audio_array.shape[0] != self.audio_length: + logger.error(f"audio length is not match, {audio_array.shape[0]} != {self.audio_length}") + assert audio_array.shape[0] == self.audio_length return audio_array From 4a61198fabbd1e2e116905e5a1333f0b4b9e13ba Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 05:26:15 +0000 Subject: [PATCH 41/51] fix --- lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 71fdb3f3b1..03c57126ff 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -9,8 +9,7 @@ from typing import Callable, Optional, Union, List from transformers.activations import ACT2FN -from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload -from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data +from lightllm.server.multimodal_params import AudioItem from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor @@ -336,8 +335,8 @@ def encode(self, audio_items: List[AudioItem]): if isinstance(item, AudioItem): uuids.append(item.uuid) items.append(item) - audio_data = read_shm(get_shm_name_data(item.uuid)) - audio = load_audio_from_shm_payload(audio_data, item.extra_params, self.processor.sampling_rate) + assert self.processor.sampling_rate == 16000 + audio = item.load_audio_from_shm_payload() else: raise ValueError(f"cannot read audio which type is {type(item)}!") From ccd4b573e70b8fbbe3af0afffb3cf67caa4c66c1 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 05:30:02 +0000 Subject: [PATCH 42/51] fix --- lightllm/models/whisper/whisper_audio.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index 4cd9619e55..aaa29e1c71 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -6,8 +6,7 @@ from typing import List, Union from safetensors.torch import load_file from transformers.processing_utils import ProcessorMixin -from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data -from lightllm.server.multimodal_params import AudioItem, load_audio_from_shm_payload +from lightllm.server.multimodal_params import AudioItem # tokenizer_class removed @@ -37,7 +36,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True): return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps) def get_T_after_cnn(self, L_in, dilation=1): - for padding, kernel_size, stride in eval("[(1,3,1)] + [(1,3,2)] "): + for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "): L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1 L_out = 1 + L_out // stride L_in = L_out @@ -168,8 +167,7 @@ def encode(self, audio_items: List[AudioItem]): if isinstance(item, AudioItem): uuids.append(item.uuid) items.append(item) - audio_data = read_shm(get_shm_name_data(item.uuid)) - audio = load_audio_from_shm_payload(audio_data, item.extra_params, 16000) + audio = item.load_audio_from_shm_payload() else: raise ValueError(f"cannot read audio which type is {type(item)}!") @@ -217,7 +215,9 @@ def encode(self, audio_items: List[AudioItem]): ans_embeds = [] for i in range(len(uuids)): + item = items[i] + # 拼接该 audio 的所有 chunk embedding cur_embed = torch.cat(per_audio_embeds[i], dim=0) ans_embeds.append(cur_embed) From b7d11876a659fb0e12c5886f8db38c4229f50b76 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 05:49:05 +0000 Subject: [PATCH 43/51] fix --- lightllm/server/httpserver/manager.py | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 115be4bd38..07d5936890 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -618,30 +618,16 @@ async def transfer_to_next_module( return if not self.args.disable_audio: - logger.debug( - f"lightllm_req_id:{group_req_objs.group_req_id} " - f"stage:transfer_to_audio " - f"target_port:{self.args.audio_port}" - ) self.send_to_audio.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return if self.args.enable_cpu_cache: - logger.debug( - f"lightllm_req_id:{group_req_objs.group_req_id} " - f"stage:transfer_to_multi_level_kv_cache target_port:{self.args.multi_level_kv_cache_port}" - ) self.send_to_multi_level_kv_cache.send_pyobj( group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL, ) return - logger.debug( - f"lightllm_req_id:{group_req_objs.group_req_id} " - f"stage:transfer_to_router " - f"target_port:{self.args.router_port}" - ) self.send_to_router.send_pyobj( group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL, @@ -650,11 +636,6 @@ async def transfer_to_next_module( if self.pd_mode.is_D(): # 在 D 模式下,不需要传输真的多模态参数,因为其已经被 P 处理好了 - logger.debug( - f"lightllm_req_id:{group_req_objs.group_req_id} " - f"stage:transfer_to_router_from_decode " - f"target_port:{self.args.router_port}" - ) self.send_to_router.send_pyobj( group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL, @@ -673,6 +654,7 @@ async def _wait_to_token_package( req_status: "ReqStatus", request: Request, ): + event = req_status.event unfinished_count = sampling_params.best_of out_token_counter = 0 @@ -715,11 +697,6 @@ async def _wait_to_token_package( first_token_cost_ms = (time.time() - start_time) * 1000 is_first_token = False self.first_time_costs.add(first_token_cost_ms) - logger.info( - f"lightllm_req_id:{group_request_id} " - f"stage:first_token_arrived elapsed_ms:{first_token_cost_ms:.3f} " - f"sub_req_id:{sub_req_id} prompt_tokens:{prompt_tokens}" - ) out_token_counter += 1 @@ -803,6 +780,7 @@ async def recycle_resource_loop(self): pre_time_mark = time.time() while True: + try: await asyncio.wait_for(self.recycle_event.wait(), timeout=0.02) except asyncio.TimeoutError: @@ -879,6 +857,7 @@ async def handle_loop(self): for _ in range(read_token_count): if not req.out_tokens_queue.is_empty(): + text, src_index, special, count_output_tokens = req.out_tokens_queue.peek() req.cumlogprob += float(req.shm_logprobs.arr[src_index]) metadata = { From 40cd0b9882160db09723fe5357832f42908af619 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 06:08:28 +0000 Subject: [PATCH 44/51] fix --- lightllm/server/httpserver/manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 07d5936890..8b7dafeffe 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -301,8 +301,6 @@ async def generate( start_time = time.time() request_headers = request.headers if request is not None else {} group_request_id = self.alloc_req_id(sampling_params, is_health_req) - if request is not None: - request.state.lightllm_req_id = group_request_id audio_count = len(multimodal_params.audios) if multimodal_params is not None else 0 image_count = len(multimodal_params.images) if multimodal_params is not None else 0 self._log_stage_timing( From 284815fb33022f6c5b6fda5679c8e4508dd70c66 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 06:09:38 +0000 Subject: [PATCH 45/51] fix --- lightllm/server/httpserver/manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 8b7dafeffe..45193e928b 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -307,8 +307,8 @@ async def generate( group_request_id, start_time, "received", - has_audio=audio_count > 0, - has_image=image_count > 0, + audio_count=audio_count, + image_count=image_count, ) try: From fa11c53cde2dc7d60503c77141718b3d871a1c40 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Wed, 8 Apr 2026 06:22:24 +0000 Subject: [PATCH 46/51] fix --- lightllm/server/httpserver/manager.py | 31 +++------------------------ 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 45193e928b..610931784c 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -322,8 +322,6 @@ async def generate( group_request_id, start_time, "verify_and_preload_done", - audio_count=audio_count, - image_count=image_count, ) # 记录请求到达的相关信息 @@ -334,9 +332,6 @@ async def generate( group_request_id, start_time, "encode_done", - prompt_tokens=len(prompt_ids), - audio_count=audio_count, - image_count=image_count, ) prompt_tokens = len(prompt_ids) @@ -350,8 +345,6 @@ async def generate( group_request_id, start_time, "check_and_repair_length_done", - prompt_tokens=len(prompt_ids), - max_new_tokens=sampling_params.max_new_tokens, ) if nixl_pd_upload_websocket is not None and not is_health_req and self.pd_mode.is_NP(): @@ -404,7 +397,6 @@ async def generate( group_request_id, start_time, "shm_req_init_done", - req_count=len(req_objs), ) logger.debug( @@ -423,8 +415,6 @@ async def generate( group_request_id, start_time, "request_forwarded", - has_audio=audio_count > 0, - has_image=image_count > 0, ) results_generator = self._wait_to_token_package( @@ -481,6 +471,7 @@ def _count_multimodal_tokens(self, multimodal_params: MultimodalParams) -> Tuple return image_tokens, audio_tokens async def _log_req_header(self, request_headers, group_request_id: int): + x_request_id = request_headers.get("X-Request-Id", "") x_session_id = request_headers.get("X-Session-Id", "") @@ -493,11 +484,7 @@ async def _log_req_header(self, request_headers, group_request_id: int): return async def _encode( - self, - prompt: Union[str, List[int]], - multimodal_params: MultimodalParams, - sampling_params: SamplingParams, - start_time: Optional[float] = None, + self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams ): if isinstance(prompt, str): if self.enable_multimodal: @@ -507,14 +494,6 @@ async def _encode( if multimodal_params.audios: assert not self.args.disable_audio, "audio multimodal not enabled" await self._alloc_multimodal_resources(multimodal_params, sampling_params) - log_req_id = getattr(sampling_params, "group_request_id", None) - self._log_stage_timing( - log_req_id, - start_time, - "alloc_multimodal_resources_done", - audio_count=len(multimodal_params.audios), - image_count=len(multimodal_params.images), - ) prompt_ids = self.tokenizer.encode( prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens ) @@ -605,13 +584,9 @@ async def transfer_to_next_module( self, group_req_objs: Optional[GroupReqObjs] = None, ): + if self.pd_mode.is_P_or_NORMAL(): if not self.args.disable_vision: - logger.debug( - f"lightllm_req_id:{group_req_objs.group_req_id} " - f"stage:transfer_to_visual " - f"target_port:{self.args.visual_port}" - ) self.send_to_visual.send_pyobj(group_req_objs.to_group_req_index(), protocol=pickle.HIGHEST_PROTOCOL) return From 44c63d97eb26ac5ef3e946fa6e98acc15ae4fa14 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Wed, 8 Apr 2026 07:34:56 +0000 Subject: [PATCH 47/51] fix --- lightllm/server/httpserver/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 610931784c..c9822ff618 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -327,7 +327,7 @@ async def generate( # 记录请求到达的相关信息 await self._log_req_header(request_headers, group_request_id) # encode - prompt_ids = await self._encode(prompt, multimodal_params, sampling_params, start_time=start_time) + prompt_ids = await self._encode(prompt, multimodal_params, sampling_params) self._log_stage_timing( group_request_id, start_time, From c5cc9952105dd955b04f1ced509ce1294b7227c3 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Wed, 8 Apr 2026 09:09:12 +0000 Subject: [PATCH 48/51] support long audio --- lightllm/models/qwen3_omni_moe_thinker/audio_process.py | 2 +- lightllm/models/qwen3_omni_moe_thinker/model.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py index 194914d455..58b223d579 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/audio_process.py +++ b/lightllm/models/qwen3_omni_moe_thinker/audio_process.py @@ -102,7 +102,7 @@ def zero_mean_unit_var_norm( def _preprocess( self, raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], - truncation: bool = True, + truncation: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, diff --git a/lightllm/models/qwen3_omni_moe_thinker/model.py b/lightllm/models/qwen3_omni_moe_thinker/model.py index 1b8fa0110d..bee15e3d2a 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/model.py +++ b/lightllm/models/qwen3_omni_moe_thinker/model.py @@ -59,12 +59,7 @@ def init_audioitem_extral_params( return def get_audio_token_length(self, audio: AudioItem): - # 这里得处理对应奖语音长度按照 30 进行限制,后续处理中,超过30的会被截断。 - if audio.audio_length > self.n_samples: - logger.warning(f"audio length {audio.audio_length} exceed max length {self.n_samples}, will be truncated.") - - length = min(audio.audio_length, int(self.n_samples)) - token_num = self._caclu_audio_token_num(length) + token_num = self._caclu_audio_token_num(audio.audio_length) # print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}") return token_num From eb4558a906da5c4c8aa9ab9f8e90d18382f70cc1 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 9 Apr 2026 04:45:20 +0000 Subject: [PATCH 49/51] add check_long_audio_infer --- .../qwen3_omni_audio.py | 23 +++++++++++++++++++ lightllm/models/whisper/whisper_audio.py | 3 +++ .../audioserver/model_infer/model_rpc.py | 1 + 3 files changed, 27 insertions(+) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 03c57126ff..9fb4e1d1db 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -10,10 +10,13 @@ from transformers.activations import ACT2FN from lightllm.server.multimodal_params import AudioItem +from lightllm.utils.log_utils import init_logger from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor +logger = init_logger(__name__) + def _get_feat_extract_output_lengths(input_lengths): """ @@ -259,6 +262,7 @@ def load_model(self, weight_dir, config): self.load_state_dict(weight_dict) + @torch.inference_mode() def forward( self, input_features, @@ -327,6 +331,7 @@ def forward( hidden_states = self.proj2(hidden_states) return hidden_states + @torch.inference_mode() def encode(self, audio_items: List[AudioItem]): uuids = [] items: List[AudioItem] = [] @@ -363,3 +368,21 @@ def encode(self, audio_items: List[AudioItem]): all_embeds.append(cur_embed) return all_embeds, audio_items + + @torch.inference_mode() + def check_long_audio_infer(self): + """Exercise forward with mel length chosen so the conv loop runs once with batch dim == conv_chunksize.""" + device = next(self.parameters()).device + frame_len = self.conv_chunksize * (self.n_window * 2) + logger.info( + "check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s", + frame_len, + self.conv_chunksize, + self.n_window, + device, + self.data_type, + ) + input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=self.data_type) + feature_lens = torch.tensor([frame_len], device=device, dtype=torch.long) + out = self.forward(input_features, feature_lens=feature_lens) + logger.info("check_long_audio_infer: done output_shape=%s", tuple(out.shape)) diff --git a/lightllm/models/whisper/whisper_audio.py b/lightllm/models/whisper/whisper_audio.py index aaa29e1c71..8a984d29a5 100644 --- a/lightllm/models/whisper/whisper_audio.py +++ b/lightllm/models/whisper/whisper_audio.py @@ -223,3 +223,6 @@ def encode(self, audio_items: List[AudioItem]): ans_embeds.append(cur_embed) return ans_embeds, audio_items + + def check_long_audio_infer(self): + pass diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py index 39a7e06ac3..82919856d9 100644 --- a/lightllm/server/audioserver/model_infer/model_rpc.py +++ b/lightllm/server/audioserver/model_infer/model_rpc.py @@ -51,6 +51,7 @@ def exposed_init_model(self, kvargs): self.model.load_model(weight_dir, model_cfg) self.model = self.model.cuda() + self.model.check_long_audio_infer() self.cache_client = rpyc.connect("localhost", self.cache_port, config={"allow_pickle": True}) self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) From 94ffee1fab412071a93dfc69abaeeb3d0fd43356 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 9 Apr 2026 07:05:03 +0000 Subject: [PATCH 50/51] add LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE --- lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index 9fb4e1d1db..c81e1d5859 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -15,6 +15,8 @@ from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor +QWEN3_OMNI_CONV_CHUNKSIZE = int(os.getenv("LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE", 500)) + logger = init_logger(__name__) @@ -159,7 +161,7 @@ def __init__( activation_function="gelu", output_dim=2048, n_window_infer=800, - conv_chunksize=500, + conv_chunksize=QWEN3_OMNI_CONV_CHUNKSIZE, encoder_attention_heads=20, attention_dropout=0, activation_dropout=0, From 0553276514fff3eef059306b917c8a9f6084dced Mon Sep 17 00:00:00 2001 From: wanzihao <1060304770@qq.com> Date: Thu, 9 Apr 2026 15:14:34 +0800 Subject: [PATCH 51/51] Apply suggestions from code review. Use params.dtype Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../models/qwen3_omni_moe_thinker/qwen3_omni_audio.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py index c81e1d5859..ff49ab160a 100644 --- a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py +++ b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py @@ -374,7 +374,9 @@ def encode(self, audio_items: List[AudioItem]): @torch.inference_mode() def check_long_audio_infer(self): """Exercise forward with mel length chosen so the conv loop runs once with batch dim == conv_chunksize.""" - device = next(self.parameters()).device + params = next(self.parameters()) + device = params.device + dtype = params.dtype frame_len = self.conv_chunksize * (self.n_window * 2) logger.info( "check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s", @@ -382,9 +384,9 @@ def check_long_audio_infer(self): self.conv_chunksize, self.n_window, device, - self.data_type, + dtype, ) - input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=self.data_type) + input_features = torch.zeros(self.num_mel_bins, frame_len, device=device, dtype=dtype) feature_lens = torch.tensor([frame_len], device=device, dtype=torch.long) out = self.forward(input_features, feature_lens=feature_lens) logger.info("check_long_audio_infer: done output_shape=%s", tuple(out.shape))