
Commit fa533e2

Author: 钮圣虓
Message: fix
Parent: 9f43400

6 files changed: 50 additions & 19 deletions

lightllm/models/internvl/model.py

Lines changed: 0 additions & 2 deletions
@@ -56,10 +56,8 @@ def init_imageitem_extral_params(
     ):
         if sampling_params.image_max_patch_num > 0:
             img.extra_params["image_patch_max_num"] = sampling_params.image_max_patch_num
-            return
         elif os.getenv("MAX_PATCH_NUM"):
             img.extra_params["image_patch_max_num"] = int(os.getenv("MAX_PATCH_NUM"))
-            return
         else:
             num_images = len(multi_params.images)
             if num_images == 1:
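The only behavioral change in this file is dropping the two early return statements: the per-request override and the MAX_PATCH_NUM environment override still take precedence, but the method now falls through to any code placed after the if/elif/else chain. A minimal sketch of that fall-through pattern, with a hypothetical post_process step standing in for whatever follows the chain (not a function from this repository):

import os

def set_image_patch_max_num(img, multi_params, sampling_params, post_process):
    if sampling_params.image_max_patch_num > 0:
        img.extra_params["image_patch_max_num"] = sampling_params.image_max_patch_num
    elif os.getenv("MAX_PATCH_NUM"):
        img.extra_params["image_patch_max_num"] = int(os.getenv("MAX_PATCH_NUM"))
    else:
        # heuristic branch keyed off len(multi_params.images); unchanged by this commit
        ...
    # the first two branches used to return here; now every branch reaches this point
    post_process(img)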

lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py

Lines changed: 13 additions & 10 deletions
@@ -50,22 +50,18 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
         img_start_token_ids = []
         img_token_lens = []
         img_start_locs_in_cache = []
-        unique_uids = []
-        all_uids = []
         device = layer_weight.wte_weight_.weight.device
         dtype = layer_weight.wte_weight_.weight.dtype
         hidden_size = layer_weight.wte_weight_.weight.shape[1]

-        for _, p in enumerate(infer_state.multimodal_params):
+        for batch_id, p in enumerate(infer_state.multimodal_params):
             for img in p["images"] + p["audios"]:
-                all_uids.append(img["uuid"])
                 # skip the same image
                 if img["token_id"] in img_start_token_ids:
                     continue
                 img_start_token_ids.append(img["token_id"])
                 img_token_lens.append(img["token_num"])
                 img_start_locs_in_cache.append(img["start_index_in_embed_cache"])
-                unique_uids.append(img["uuid"])
         out = torch.zeros((len(input_ids), hidden_size), dtype=dtype, device=device)

         from lightllm.server.router.model_infer.infer_batch import g_infer_context
@@ -78,12 +74,19 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
         )

         if self.args.enable_remote_vit:
-            for uid, start_index_in_embed_cache in zip(unique_uids, img_start_locs_in_cache):
-                embed_tensor = load_tensor_afs(get_shm_name_embed(uid), self.args.image_embed_dir)
-                self._copy_loaded_embed_to_cache(embed_tensor, cpu_embed_cache_tensor, start_index_in_embed_cache)
+            unique_image_uids = []
+            for _, p in enumerate(infer_state.multimodal_params):
+                for img in p["images"]:
+                    if img["uuid"] in unique_image_uids:
+                        continue
+                    img_uid = img["uuid"]
+                    img_idx = img["start_index_in_embed_cache"]
+                    unique_image_uids.append(img_uid)
+                    embed_tensor = load_tensor_afs(get_shm_name_embed(img_uid), self.args.image_embed_dir)
+                    self._copy_loaded_embed_to_cache(embed_tensor, cpu_embed_cache_tensor, img_idx)

-        if all_uids:
-            self.cache_client.root.release(all_uids)
+            if unique_image_uids:
+                self.cache_client.root.release(unique_image_uids)

         assert cpu_embed_cache_tensor.shape[2] == hidden_size, (
             f"Dimension mismatch: text weight dimension is {hidden_size}, "

lightllm/server/api_lightllm.py

Lines changed: 5 additions & 2 deletions
@@ -5,6 +5,7 @@
 from lightllm.server.core.objs.sampling_params import SamplingParams
 from .multimodal_params import MultimodalParams
 from .httpserver.manager import HttpServerManager
+from lightllm.utils.envs_utils import get_env_start_args
 import ujson as json


@@ -154,13 +155,15 @@ async def stream_results() -> AsyncGenerator[bytes, None]:

 async def lightllm_get_image_embedding(request: Request, httpserver_manager: HttpServerManager) -> Response:
     request_dict = await request.json()
-    # request_dict: {'parameters': {'max_new_tokens': 128},
-    # 'multimodal_params': {'images': [{'type': 'base64', 'data': 'base64'}]}}
+    args = get_env_start_args()
+    assert not args.disable_vision
+    assert args.enable_remote_vit
     sample_params_dict = request_dict["parameters"]
     sampling_params = SamplingParams()
     sampling_params.init(tokenizer=None, **sample_params_dict)
     sampling_params.verify()
     multimodal_params_dict = request_dict.get("multimodal_params", {})
+    assert not multimodal_params_dict.get("audios")
     multimodal_params = MultimodalParams(**multimodal_params_dict)

     await httpserver_manager.get_image_embeding(sampling_params, multimodal_params, request=request)
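The removed comment documented the request body this endpoint expects, and that shape is still what a caller sends: a JSON object with parameters and multimodal_params carrying base64 images (audios are now rejected by the new assert). A hedged client-side sketch; the URL, port and path are assumptions for illustration, only the payload shape comes from the removed comment:

import base64
import requests

# endpoint location is an assumption, not taken from this diff
url = "http://localhost:8000/get_image_embedding"

with open("demo.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode()

payload = {
    "parameters": {"max_new_tokens": 128},
    "multimodal_params": {"images": [{"type": "base64", "data": img_b64}]},
    # note: the handler now asserts that no "audios" entries are present
}
resp = requests.post(url, json=payload)
print(resp.status_code, resp.text[:200])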

lightllm/server/api_start.py

Lines changed: 2 additions & 0 deletions
@@ -537,6 +537,8 @@ def visual_start(args):

     if args.visual_nccl_ports is not None:
         args.visual_nccl_ports = args.visual_nccl_ports[: args.visual_dp]
+    else:
+        args.visual_nccl_ports = visual_nccl_ports

     args.router_port = router_port
     args.visual_port = visual_port
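The new else branch only supplies the pre-allocated defaults when --visual_nccl_ports was not passed; an explicit list is still truncated to one port per visual data-parallel rank. A minimal standalone sketch of that select-or-truncate rule (the port numbers are illustrative, not from the repository):

def resolve_visual_nccl_ports(user_ports, default_ports, visual_dp):
    # explicit list: keep one port per visual DP rank; otherwise use the defaults as-is
    if user_ports is not None:
        return user_ports[:visual_dp]
    return default_ports

print(resolve_visual_nccl_ports([28500, 28501, 28502], [29500, 29501], visual_dp=2))  # [28500, 28501]
print(resolve_visual_nccl_ports(None, [29500, 29501], visual_dp=2))                   # [29500, 29501]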

lightllm/server/embed_cache/impl/memory_cache_with_redis.py

Lines changed: 7 additions & 0 deletions
@@ -54,6 +54,13 @@ def set_items_embed(self, ids: list[int]) -> None:
             rec = self._records.get(id)
             if rec is not None:
                 rec.embed = True
+                # Before the embed becomes ready, concurrent miss requests are only
+                # tracked by the local record refcount. Materialize the remaining
+                # pending readers into Redis so each later release has a matching
+                # remote ref to consume.
+                pending_remote_readers = max(rec.ref - 1, 0)
+                for _ in range(pending_remote_readers):
+                    self.redis_cache.query_and_incre(str(id))
                 if rec.ref > 0:
                     self._update_record_ref_by_id(id, -1)
                     # keep one redis reference; release it only after the real consumer has finished reading,
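The fix closes a gap between two refcounts: readers that miss while the embed is still being computed are only tracked in the local record, so once the embed flips to ready the remaining pending readers are mirrored into Redis, and every later release then finds a matching remote reference. A simplified, self-contained sketch of that two-level bookkeeping, with plain dicts standing in for the real record store and Redis client:

class TwoLevelRefcount:
    """Toy model of the local-record / remote-Redis refcount handshake."""

    def __init__(self):
        self.local_ref = {}   # id -> readers waiting for the embed locally
        self.remote_ref = {}  # id -> references visible on the remote (Redis) side

    def miss(self, id):
        # a reader arrives before the embed is ready: counted locally only
        self.local_ref[id] = self.local_ref.get(id, 0) + 1

    def set_embed_ready(self, id):
        ref = self.local_ref.get(id, 0)
        # mirror all but one pending reader into the remote side, so each
        # later release has a remote reference to consume
        for _ in range(max(ref - 1, 0)):
            self.remote_ref[id] = self.remote_ref.get(id, 0) + 1
        if ref > 0:
            self.local_ref[id] = ref - 1

    def release(self, id):
        if self.remote_ref.get(id, 0) > 0:
            self.remote_ref[id] -= 1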

lightllm/server/httpserver/manager.py

Lines changed: 23 additions & 5 deletions
@@ -139,6 +139,7 @@ async def _alloc_resource(self, items, uuids, token_nums, datas):
             raise Exception(str(records) + "and try to set --embed_cache_storage_size bigger")

         uid_list = []
+        unique_image_uids = []
         for item, rec in zip(items, records):
             item: Union[ImageItem, AudioItem] = item
             item.uuid = rec["id"]
@@ -147,11 +148,13 @@ async def _alloc_resource(self, items, uuids, token_nums, datas):
             item.start_index_in_embed_cache = rec["start_index_in_embed_cache"]

             uid_list.append(rec["id"])
+            if isinstance(item, ImageItem) and rec["id"] not in unique_image_uids:
+                unique_image_uids.append(rec["id"])

-        # # If enable the vit/audio-llm disaggregation, no need to cache the data in the memory of the server
+        # # If enable the vit-llm disaggregation, no need to cache the data in the memory of the server
         if self.args.enable_remote_vit:
             # avoid the remote LRU entry being evicted
-            self.cache_client.root.get_items_embed(uid_list, False)
+            self.cache_client.root.get_items_embed(unique_image_uids, False)

         ready_flags = obtain(self.cache_client.root.get_items_data(uid_list))
         update_data_ids = []
@@ -251,6 +254,15 @@ async def loop_for_request(self):
                     sampling_params,
                     multimodal_params,
                 ) = await self.multinode_req_manager.recv_pyobj()
+
+                if prompt is None:
+
+                    async def image_embedding_wrapper(sampling_params, multimodal_params):
+                        await self.get_image_embeding(sampling_params, multimodal_params, None)
+
+                    asyncio.create_task(image_embedding_wrapper(sampling_params, multimodal_params))
+                    continue
+
                 results_generator = self.generate(prompt, sampling_params, multimodal_params, None)

                 async def generate_wrapper(results_generator):
@@ -450,7 +462,11 @@ async def get_image_embeding(
         visual_req_status = GroupReqObjs(group_request_id, multimodal_params, None, start_time)

         await self.transfer_to_next_module_or_node(
-            None, sampling_params, original_multimodal_params, visual_req_status
+            None,
+            sampling_params,
+            original_multimodal_params,
+            visual_req_status,
+            only_visual=True,
         )
         await self._release_multimodal_resources(multimodal_params)

@@ -573,6 +589,7 @@ async def transfer_to_next_module_or_node(
         sampling_params: SamplingParams,
         original_multimodal_params: MultimodalParams,
         group_req_objs: Optional[GroupReqObjs] = None,
+        only_visual: bool = False,
     ):
         # In multi-node pure tp mode, the master node needs to forward the request to the slave nodes.
         if self.is_multinode_tp_master:
@@ -582,19 +599,20 @@ async def transfer_to_next_module_or_node(
                 protocol=pickle.HIGHEST_PROTOCOL,
             )

-        await self.transfer_to_next_module(group_req_objs)
+        await self.transfer_to_next_module(group_req_objs, only_visual=only_visual)
         return

     async def transfer_to_next_module(
         self,
         group_req_objs: Optional[GroupReqObjs] = None,
+        only_visual: bool = False,
     ):

         if self.pd_mode.is_P_or_NORMAL():
             group_req_index = group_req_objs.to_group_req_index()
             if not self.args.disable_vision:
                 await self.vit_manager.send_to_vit(group_req_index, protocol=pickle.HIGHEST_PROTOCOL)
-                if not self.args.enable_remote_vit:
+                if only_visual or not self.args.enable_remote_vit:
                     return

             if not self.args.disable_audio:
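Two parts of this change are easy to miss: a forwarded request arriving with prompt set to None is now treated as a pure image-embedding request and handed to a background task instead of the generate path, and only_visual=True makes transfer_to_next_module stop right after the request has been sent to the ViT workers. A hedged sketch of the dispatch half of that, written independently of the real HttpServerManager (get_image_embedding and generate are stand-in async callables):

import asyncio

async def route_forwarded_request(prompt, sampling_params, multimodal_params, *,
                                  get_image_embedding, generate):
    # Prompt-less requests only need vision embeddings: fire-and-forget, no token generation.
    if prompt is None:
        asyncio.create_task(get_image_embedding(sampling_params, multimodal_params))
        return None
    # Normal requests continue down the usual generation path.
    return await generate(prompt, sampling_params, multimodal_params)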
