Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
c8b3888
qwen3_vl_moe support prefill_cudagraph
WANDY666 Mar 26, 2026
e7fba3a
add audio dp
WANDY666 Mar 27, 2026
671b5aa
Add startup warmups for HTTP audio preload and per-rank audio workers…
WANDY666 Mar 27, 2026
a387259
add http client cache
WANDY666 Mar 30, 2026
cd89cd6
reduce polling time
WANDY666 Mar 30, 2026
4788980
Optimize audio shm payload handling and cache lookups
WANDY666 Mar 30, 2026
7b05403
cache hann_window/mel_filters
WANDY666 Mar 30, 2026
713c45d
Fix audio preload config to follow tokenizer settings
WANDY666 Mar 30, 2026
65a3ec6
Optimize qwen3 omni audio preprocessing fast path
WANDY666 Mar 31, 2026
2e48008
Add audio server fast path for single pending requests
WANDY666 Mar 31, 2026
456a71a
fix num_frames
WANDY666 Apr 1, 2026
479367d
tune fp8
WANDY666 Apr 2, 2026
2c09aa2
set default model
WANDY666 Apr 2, 2026
5168dae
add prompt_text_cache to QWen3OmniTokenizer
WANDY666 Apr 2, 2026
167f8b0
multi images or audios use asyncio
WANDY666 Apr 2, 2026
30d8603
single file without _resource_lock
WANDY666 Apr 2, 2026
db3e63b
use deque instead of list
WANDY666 Apr 2, 2026
2fbd55d
Merge branch 'main' of https://github.com/ModelTC/LightLLM into optim…
WANDY666 Apr 3, 2026
878c2f9
chore: format merged audio/httpserver files
WANDY666 Apr 3, 2026
ab788d9
chore: improve qwen3 omni audio formatting
WANDY666 Apr 3, 2026
0570b96
fix
WANDY666 Apr 3, 2026
70aad72
fix
WANDY666 Apr 3, 2026
86a16f7
fix md5 and
WANDY666 Apr 7, 2026
4601637
fix md5
WANDY666 Apr 7, 2026
16203e4
format
WANDY666 Apr 7, 2026
93421d2
using asyncio.to_thread preventing the server from handling other con…
WANDY666 Apr 7, 2026
f7b0589
fix
WANDY666 Apr 7, 2026
0ea2156
fix
hiworldwzj Apr 7, 2026
6856540
fix
WANDY666 Apr 7, 2026
9d0671b
use details_log to log
WANDY666 Apr 7, 2026
8e21207
delete warmup
WANDY666 Apr 7, 2026
fe39faa
delete audio_preload_config
WANDY666 Apr 7, 2026
f1c9f07
delete _preprocess_single_padded
WANDY666 Apr 7, 2026
9bee105
fix
hiworldwzj Apr 8, 2026
6c9c490
fix
hiworldwzj Apr 8, 2026
3b057d0
fix
hiworldwzj Apr 8, 2026
a8a8130
fix
hiworldwzj Apr 8, 2026
4479a65
fix
hiworldwzj Apr 8, 2026
be59513
fix
hiworldwzj Apr 8, 2026
3b0e613
fix
hiworldwzj Apr 8, 2026
56af31d
fix
hiworldwzj Apr 8, 2026
4a61198
fix
hiworldwzj Apr 8, 2026
ccd4b57
fix
hiworldwzj Apr 8, 2026
b7d1187
fix
hiworldwzj Apr 8, 2026
40cd0b9
fix
hiworldwzj Apr 8, 2026
284815f
fix
hiworldwzj Apr 8, 2026
fa11c53
fix
hiworldwzj Apr 8, 2026
44c63d9
fix
WANDY666 Apr 8, 2026
c5cc995
support long audio
WANDY666 Apr 8, 2026
eb4558a
add check_long_audio_infer
WANDY666 Apr 9, 2026
94ffee1
add LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE
WANDY666 Apr 9, 2026
73ece22
merge
WANDY666 Apr 9, 2026
0553276
Apply suggestions from code review. Use params.dtype
WANDY666 Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lightllm/models/qwen3_omni_moe_thinker/audio_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def zero_mean_unit_var_norm(
def _preprocess(
self,
raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
truncation: bool = True,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = None,
Expand Down
7 changes: 1 addition & 6 deletions lightllm/models/qwen3_omni_moe_thinker/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,7 @@ def init_audioitem_extral_params(
return

def get_audio_token_length(self, audio: AudioItem):
# 这里得处理对应奖语音长度按照 30 进行限制,后续处理中,超过30的会被截断。
if audio.audio_length > self.n_samples:
logger.warning(f"audio length {audio.audio_length} exceed max length {self.n_samples}, will be truncated.")

length = min(audio.audio_length, int(self.n_samples))
token_num = self._caclu_audio_token_num(length)
token_num = self._caclu_audio_token_num(audio.audio_length)
# print(f"token_num is {token_num} n_samples is {self.n_samples} hop_length is {self.hop_length}")
return token_num

Expand Down
29 changes: 28 additions & 1 deletion lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,15 @@
from transformers.activations import ACT2FN

from lightllm.server.multimodal_params import AudioItem
from lightllm.utils.log_utils import init_logger
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
from lightllm.models.qwen3_omni_moe_thinker.audio_process import WhisperFeatureExtractor

QWEN3_OMNI_CONV_CHUNKSIZE = int(os.getenv("LIGHTLLM_QWEN3_OMNI_CONV_CHUNKSIZE", 500))

logger = init_logger(__name__)


def _get_feat_extract_output_lengths(input_lengths):
"""
Expand Down Expand Up @@ -156,7 +161,7 @@ def __init__(
activation_function="gelu",
output_dim=2048,
n_window_infer=800,
conv_chunksize=500,
conv_chunksize=QWEN3_OMNI_CONV_CHUNKSIZE,
encoder_attention_heads=20,
attention_dropout=0,
activation_dropout=0,
Expand Down Expand Up @@ -259,6 +264,7 @@ def load_model(self, weight_dir, config):

self.load_state_dict(weight_dict)

@torch.inference_mode()
def forward(
self,
input_features,
Expand Down Expand Up @@ -327,6 +333,7 @@ def forward(
hidden_states = self.proj2(hidden_states)
return hidden_states

@torch.inference_mode()
def encode(self, audio_items: List[AudioItem]):
uuids = []
items: List[AudioItem] = []
Expand Down Expand Up @@ -363,3 +370,23 @@ def encode(self, audio_items: List[AudioItem]):
all_embeds.append(cur_embed)

return all_embeds, audio_items

@torch.inference_mode()
def check_long_audio_infer(self):
    """Smoke-test the audio encoder on a synthetic long input.

    Builds a zero mel spectrogram whose frame count makes the conv loop run
    exactly once with a batch dimension equal to ``conv_chunksize``, then runs
    a full forward pass. Logs start/done markers so startup failures on long
    audio surface immediately rather than at serving time.
    """
    # Borrow device/dtype from an arbitrary parameter so the dummy input
    # matches wherever the model was placed.
    ref_param = next(self.parameters())
    target_device = ref_param.device
    target_dtype = ref_param.dtype
    # Each attention window spans n_window * 2 mel frames; conv_chunksize
    # windows fill one conv batch exactly.
    total_frames = 2 * self.n_window * self.conv_chunksize
    logger.info(
        "check_long_audio_infer: start frame_len=%s conv_chunksize=%s n_window=%s device=%s dtype=%s",
        total_frames,
        self.conv_chunksize,
        self.n_window,
        target_device,
        target_dtype,
    )
    mel = torch.zeros(self.num_mel_bins, total_frames, device=target_device, dtype=target_dtype)
    mel_lens = torch.tensor([total_frames], device=target_device, dtype=torch.long)
    encoded = self.forward(mel, feature_lens=mel_lens)
    logger.info("check_long_audio_infer: done output_shape=%s", tuple(encoded.shape))
3 changes: 3 additions & 0 deletions lightllm/models/whisper/whisper_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,6 @@ def encode(self, audio_items: List[AudioItem]):
ans_embeds.append(cur_embed)

return ans_embeds, audio_items

def check_long_audio_infer(self):
    """No-op long-audio warmup hook.

    Present so callers (e.g. the audio server's model init, which invokes
    this after ``load_model``) can call it uniformly on every audio model;
    the whisper encoder needs no long-audio warmup.
    """
    return None
1 change: 1 addition & 0 deletions lightllm/server/audioserver/model_infer/model_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def exposed_init_model(self, kvargs):

self.model.load_model(weight_dir, model_cfg)
self.model = self.model.cuda()
self.model.check_long_audio_infer()

self.cache_client = rpyc.connect("localhost", self.cache_port, config={"allow_pickle": True})
self.cache_client._channel.stream.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
Expand Down
Loading