Merged
83 commits
9fdbcf3
Dev (#496)
CarltonXiang Nov 21, 2025
439ed49
hotfix:hotfix
fridayL Nov 21, 2025
2b6dc7e
hotfix:hotfix (#513)
fridayL Nov 21, 2025
39a7b34
test: add routers api
CaralHsi Nov 22, 2025
cbed950
fix: doc fine mode bug
CaralHsi Dec 7, 2025
20e0839
fix: doc fine mode bug
CaralHsi Dec 7, 2025
fff0fb2
feat: init longbench_v2
CaralHsi Dec 7, 2025
15562c4
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 7, 2025
9beabba
feat: more strict embedder trucation
CaralHsi Dec 7, 2025
fc54da8
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 7, 2025
8f368bb
feat: parallel processing fine mode in multi-modal-fine
CaralHsi Dec 7, 2025
54897a9
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 7, 2025
be293bc
feat: update parsers; add chunk info into source; remove origin_part
CaralHsi Dec 8, 2025
ba1c161
fix: conflict
CaralHsi Dec 8, 2025
8e8b91b
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
2edd0a3
feat: modify chunk_content in file-fine-parser
CaralHsi Dec 8, 2025
6991ed7
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
45609ab
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
f80896e
fix: token counter bug
CaralHsi Dec 8, 2025
a3f2b32
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
b375d51
feat: enlarge polardb
CaralHsi Dec 8, 2025
0bfcaa9
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
69dd3a8
feat: derease parallrl
CaralHsi Dec 8, 2025
7fa7b77
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 8, 2025
ac38046
feat: add image parser in file
CaralHsi Dec 8, 2025
ef02140
feat: add image parser in file
CaralHsi Dec 8, 2025
37bcc90
feat: update file_content_parser
CaralHsi Dec 8, 2025
7e2adb4
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 8, 2025
20af5d0
feat: modify long_bench_v2
CaralHsi Dec 9, 2025
ec34637
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
31ad564
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
72eb129
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
0ef1bb5
feat: modify long_bench_v2
CaralHsi Dec 9, 2025
cf1291b
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
1ecf03e
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
b58ee88
fix: image bug
CaralHsi Dec 9, 2025
f94b001
feat: increase playground depth
CaralHsi Dec 9, 2025
3819dae
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
eba9e96
feat: set parsed_text None in file parser
CaralHsi Dec 9, 2025
5504d8d
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 9, 2025
5c496ee
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
918bc6a
fix: file_ids bug in file-mode
CaralHsi Dec 9, 2025
1e6dd73
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 9, 2025
a8ac57c
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 9, 2025
40998f8
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 15, 2025
56e0d6d
feat: update evaluation
CaralHsi Dec 15, 2025
c64fd26
feat: update evaluation
CaralHsi Dec 15, 2025
41ac6c2
feat: add general string prompt
CaralHsi Dec 16, 2025
0696126
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 16, 2025
3669b39
fix: conflict
CaralHsi Dec 16, 2025
8d5b51f
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 16, 2025
eaedc9a
fix: test server router
CaralHsi Dec 16, 2025
7674ecc
feat: update evluation
CaralHsi Dec 16, 2025
187e8f2
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
288207f
Merge branch 'dev' of github.com:MemTensor/MemOS into feat/evaluation…
CaralHsi Dec 17, 2025
66e9325
feat: decrease graph-db batch size to 5
CaralHsi Dec 17, 2025
eb426a2
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
321d5e0
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
e10365c
fix: default name in long_bench-v2/longbench_v2_search
CaralHsi Dec 17, 2025
31f07fc
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
316e147
fix: test bug
CaralHsi Dec 17, 2025
e8e29f8
Update test_server_router.py
CaralHsi Dec 17, 2025
ce70121
Update test_product_router.py
CaralHsi Dec 17, 2025
9a379f5
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
9e7ca00
feat: comment
CaralHsi Dec 17, 2025
e1f46b7
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
2344f6f
feat: add lang detection in multi_modal_struct and user-parser-modal
CaralHsi Dec 17, 2025
2491edb
feat: add lang detection in image parser
CaralHsi Dec 17, 2025
3a1a9bd
feat: add assistant parser lang detection
CaralHsi Dec 17, 2025
4f3c986
feat: update base (lang added to source)
CaralHsi Dec 17, 2025
cd65b69
feat: lang added to source in string parser
CaralHsi Dec 17, 2025
ffe4ea6
feat: lang added to source in system parser
CaralHsi Dec 17, 2025
7c1d82c
feat: lang added to source in text_content_parser
CaralHsi Dec 17, 2025
4444def
feat: lang added to source in user parser
CaralHsi Dec 17, 2025
6c1c8ef
feat: lang added to source in user parser
CaralHsi Dec 17, 2025
f9a69a0
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
fab15db
Merge branch 'feat/evaluation_doc_qa' of github.com:CaralHsi/MemOSRea…
CaralHsi Dec 17, 2025
e1dc9dc
feat: lang added to source in tool parser
CaralHsi Dec 17, 2025
27d87c4
feat: modify lang detection for fine-string parser
CaralHsi Dec 17, 2025
11b3588
fix: context_items
CaralHsi Dec 17, 2025
731cbc6
fix: json ensure ascii
CaralHsi Dec 17, 2025
30e1401
Merge branch 'dev' into feat/evaluation_doc_qa
CaralHsi Dec 17, 2025
e6c1603
Merge branch 'dev' into feat/evaluation_doc_qa
fridayL Dec 18, 2025
103 changes: 78 additions & 25 deletions src/memos/mem_reader/multi_modal_struct.py
@@ -8,8 +8,9 @@
from memos.configs.mem_reader import MultiModalStructMemReaderConfig
from memos.context.context import ContextThreadPoolExecutor
from memos.mem_reader.read_multi_modal import MultiModalParser, detect_lang
from memos.mem_reader.read_multi_modal.base import _derive_key
from memos.mem_reader.simple_struct import PROMPT_DICT, SimpleStructMemReader
from memos.memories.textual.item import TextualMemoryItem
from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
from memos.templates.tool_mem_prompts import TOOL_TRAJECTORY_PROMPT_EN, TOOL_TRAJECTORY_PROMPT_ZH
from memos.types import MessagesType
from memos.utils import timed
@@ -184,6 +185,33 @@ def _concat_multi_modal_memories(
if window:
windows.append(window)

# Batch compute embeddings for all windows
if windows:
# Collect all valid windows that need embedding
valid_windows = [w for w in windows if w and w.memory]

if valid_windows:
# Collect all texts that need embedding
texts_to_embed = [w.memory for w in valid_windows]

# Batch compute all embeddings at once
try:
embeddings = self.embedder.embed(texts_to_embed)
# Fill embeddings back into memory items
for window, embedding in zip(valid_windows, embeddings, strict=True):
window.metadata.embedding = embedding
except Exception as e:
logger.error(f"[MultiModalStruct] Error batch computing embeddings: {e}")
# Fallback: compute embeddings individually
for window in valid_windows:
if window.memory:
try:
window.metadata.embedding = self.embedder.embed([window.memory])[0]
except Exception as e2:
logger.error(
f"[MultiModalStruct] Error computing embedding for item: {e2}"
)

return windows

def _build_window_from_items(
Expand Down Expand Up @@ -247,17 +275,35 @@ def _build_window_from_items(
# If no text content, return None
return None

# Create aggregated memory item (similar to _build_fast_node in simple_struct)
# Create aggregated memory item without embedding (will be computed in batch later)
extra_kwargs: dict[str, Any] = {}
if aggregated_file_ids:
extra_kwargs["file_ids"] = aggregated_file_ids
aggregated_item = self._make_memory_item(
value=merged_text,
info=info,
memory_type=memory_type,
tags=["mode:fast"],
sources=all_sources,
**extra_kwargs,

# Extract info fields
info_ = info.copy()
user_id = info_.pop("user_id", "")
session_id = info_.pop("session_id", "")

# Create memory item without embedding (set to None, will be filled in batch)
aggregated_item = TextualMemoryItem(
memory=merged_text,
metadata=TreeNodeTextualMemoryMetadata(
user_id=user_id,
session_id=session_id,
memory_type=memory_type,
status="activated",
tags=["mode:fast"],
key=_derive_key(merged_text),
embedding=None, # Will be computed in batch
usage=[],
sources=all_sources,
background="",
confidence=0.99,
type="fact",
info=info_,
**extra_kwargs,
),
)

return aggregated_item
@@ -282,22 +328,23 @@ def _get_llm_response(
Returns:
LLM response dictionary
"""
# Try to extract actual text content from sources for better language detection
text_for_lang_detection = mem_str
# Determine language: prioritize lang from sources (set in fast mode),
# fallback to detecting from mem_str if sources don't have lang
lang = None

# First, try to get lang from sources (fast mode already set this)
if sources:
source_texts = []
for source in sources:
if hasattr(source, "content") and source.content:
source_texts.append(source.content)
elif isinstance(source, dict) and source.get("content"):
source_texts.append(source.get("content"))

# If we have text content from sources, use it for language detection
if source_texts:
text_for_lang_detection = " ".join(source_texts)

# Use the extracted text for language detection
lang = detect_lang(text_for_lang_detection)
if hasattr(source, "lang") and source.lang:
lang = source.lang
break
elif isinstance(source, dict) and source.get("lang"):
lang = source.get("lang")
break

# Fallback: detect language from mem_str if no lang from sources
if lang is None:
lang = detect_lang(mem_str)

# Select prompt template based on prompt_type
if prompt_type == "doc":
@@ -574,8 +621,13 @@ def _process_multi_modal_data(
for fast_item in fast_memory_items:
sources = fast_item.metadata.sources
for source in sources:
lang = getattr(source, "lang", "en")
items = self.multi_modal_parser.process_transfer(
source, context_items=[fast_item], custom_tags=custom_tags, info=info
source,
context_items=[fast_item],
custom_tags=custom_tags,
info=info,
lang=lang,
)
fine_memory_items.extend(items)
return fine_memory_items
@@ -616,8 +668,9 @@ def _process_transfer_multi_modal_data(

# Part B: get fine multimodal items
for source in sources:
lang = getattr(source, "lang", "en")
items = self.multi_modal_parser.process_transfer(
source, context_items=[raw_node], info=info, custom_tags=custom_tags
source, context_items=[raw_node], info=info, custom_tags=custom_tags, lang=lang
)
fine_memory_items.extend(items)
return fine_memory_items
156 changes: 92 additions & 64 deletions src/memos/mem_reader/read_multi_modal/assistant_parser.py
@@ -14,7 +14,8 @@
)
from memos.types.openai_chat_completion_types import ChatCompletionAssistantMessageParam

from .base import BaseMessageParser, _derive_key, _extract_text_from_content
from .base import BaseMessageParser, _add_lang_to_source, _derive_key, _extract_text_from_content
from .utils import detect_lang


logger = get_logger(__name__)
@@ -68,71 +69,90 @@ def create_source(
sources = []

if isinstance(raw_content, list):
# Multimodal: create one SourceMessage per part
# Multimodal: first collect all text content to detect overall language
text_contents = []
for part in raw_content:
if isinstance(part, dict):
part_type = part.get("type", "")
if part_type == "text":
text_contents.append(part.get("text", ""))
elif part_type == "refusal":
text_contents.append(part.get("refusal", ""))

# Detect overall language from all text content
overall_lang = "en" # default
if text_contents:
combined_text = " ".join(text_contents)
overall_lang = detect_lang(combined_text)
# Note: Assistant messages only support "text" and "refusal" part types
for part in raw_content:
if isinstance(part, dict):
part_type = part.get("type", "")
if part_type == "text":
sources.append(
SourceMessage(
type="chat",
role=role,
chat_time=chat_time,
message_id=message_id,
content=part.get("text", ""),
)
text_content = part.get("text", "")
source = SourceMessage(
type="chat",
role=role,
chat_time=chat_time,
message_id=message_id,
content=text_content,
)
source.lang = overall_lang
sources.append(source)
elif part_type == "refusal":
sources.append(
SourceMessage(
type="refusal",
role=role,
chat_time=chat_time,
message_id=message_id,
content=part.get("refusal", ""),
)
refusal_content = part.get("refusal", "")
source = SourceMessage(
type="refusal",
role=role,
chat_time=chat_time,
message_id=message_id,
content=refusal_content,
)
source.lang = overall_lang
sources.append(source)
else:
# Unknown part type - log warning but still create SourceMessage
logger.warning(
f"[AssistantParser] Unknown part type `{part_type}`. "
f"Expected `text` or `refusal`. Creating SourceMessage with placeholder content."
)
sources.append(
SourceMessage(
type="chat",
role=role,
chat_time=chat_time,
message_id=message_id,
content=f"[{part_type}]",
)
source = SourceMessage(
type="chat",
role=role,
chat_time=chat_time,
message_id=message_id,
content=f"[{part_type}]",
)
source.lang = overall_lang
sources.append(source)
elif raw_content is not None:
# Simple message: single SourceMessage
content = _extract_text_from_content(raw_content)
if content:
sources.append(
SourceMessage(
type="chat",
role=role,
chat_time=chat_time,
message_id=message_id,
content=content,
)
)

# Handle top-level refusal field
if refusal:
sources.append(
SourceMessage(
type="refusal",
source = SourceMessage(
type="chat",
role=role,
chat_time=chat_time,
message_id=message_id,
content=refusal,
content=content,
)
sources.append(_add_lang_to_source(source, content))

# Handle top-level refusal field
if refusal:
source = SourceMessage(
type="refusal",
role=role,
chat_time=chat_time,
message_id=message_id,
content=refusal,
)
# Use overall_lang if we have sources from multimodal content, otherwise detect
if sources and hasattr(sources[0], "lang"):
source.lang = sources[0].lang
else:
source = _add_lang_to_source(source, refusal)
sources.append(source)

# Handle tool_calls (when content is None or empty)
if tool_calls:
@@ -141,34 +161,42 @@
if isinstance(tool_calls, list | dict)
else str(tool_calls)
)
sources.append(
SourceMessage(
type="tool_calls",
role=role,
chat_time=chat_time,
message_id=message_id,
content=f"[tool_calls]: {tool_calls_str}",
)
source = SourceMessage(
type="tool_calls",
role=role,
chat_time=chat_time,
message_id=message_id,
content=f"[tool_calls]: {tool_calls_str}",
)
# Use overall_lang if we have sources from multimodal content, otherwise default
if sources and hasattr(sources[0], "lang"):
source.lang = sources[0].lang
else:
source = _add_lang_to_source(source, None)
sources.append(source)

# Handle audio (optional)
if audio:
audio_id = audio.get("id", "") if isinstance(audio, dict) else str(audio)
sources.append(
SourceMessage(
type="audio",
role=role,
chat_time=chat_time,
message_id=message_id,
content=f"[audio]: {audio_id}",
)
source = SourceMessage(
type="audio",
role=role,
chat_time=chat_time,
message_id=message_id,
content=f"[audio]: {audio_id}",
)

return (
sources
if len(sources) > 1
else (sources[0] if sources else SourceMessage(type="chat", role=role))
)
# Use overall_lang if we have sources from multimodal content, otherwise default
if sources and hasattr(sources[0], "lang"):
source.lang = sources[0].lang
else:
source = _add_lang_to_source(source, None)
sources.append(source)

if not sources:
return _add_lang_to_source(SourceMessage(type="chat", role=role), None)
if len(sources) > 1:
return sources
return sources[0]

def rebuild_from_source(
self,
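The assistant-parser change above pools every text-bearing part of a multimodal message to detect one overall language, then stamps that same language on each per-part source, so a short English refusal next to a long Chinese answer does not get misclassified on its own. A simplified sketch of that flow, using plain dicts and a naive CJK heuristic as a stand-in for the project's `detect_lang` (an assumption — the real detector may behave differently):

```python
def detect_lang(text: str) -> str:
    """Naive stand-in: classify as Chinese if any CJK character appears."""
    return "zh" if any("\u4e00" <= ch <= "\u9fff" for ch in text) else "en"


def sources_with_lang(parts: list[dict]) -> list[dict]:
    # 1) Pool all text-bearing parts and detect one overall language.
    texts = [p.get("text") or p.get("refusal") or "" for p in parts]
    overall_lang = detect_lang(" ".join(t for t in texts if t)) if any(texts) else "en"

    # 2) Stamp the same language on every per-part source message.
    sources = []
    for part in parts:
        content = part.get("text") or part.get("refusal") or f"[{part.get('type', '')}]"
        sources.append({"type": part.get("type"), "content": content, "lang": overall_lang})
    return sources


parts = [{"type": "text", "text": "你好"}, {"type": "refusal", "refusal": "no"}]
print(sources_with_lang(parts))
```

Detecting once over the combined text and propagating the result keeps per-part sources consistent, which matters downstream where `_get_llm_response` reads `source.lang` instead of re-detecting.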
21 changes: 20 additions & 1 deletion src/memos/mem_reader/read_multi_modal/base.py
@@ -16,7 +16,7 @@
TreeNodeTextualMemoryMetadata,
)

from .utils import get_text_splitter
from .utils import detect_lang, get_text_splitter


logger = log.get_logger(__name__)
@@ -57,6 +57,25 @@ def _extract_text_from_content(content: Any) -> str:
return str(content)


def _add_lang_to_source(source: SourceMessage, content: str | None = None) -> SourceMessage:
"""
Add lang field to SourceMessage based on content.

Args:
source: SourceMessage to add lang field to
content: Optional content text for language detection.
If None, uses source.content

Returns:
SourceMessage with lang field added
"""
if not hasattr(source, "lang") or getattr(source, "lang", None) is None:
text_for_detection = content or getattr(source, "content", None) or ""
lang = detect_lang(text_for_detection)
source.lang = lang
return source


class BaseMessageParser(ABC):
"""Base interface for message type parsers."""

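The new `_add_lang_to_source` helper in `base.py` only runs detection when the source does not already carry a `lang`, which makes it safe to call at every parser boundary without overwriting a language set earlier (e.g. in fast mode). A toy illustration of that only-if-missing behavior, with `SimpleNamespace` and a trivial detector standing in for `SourceMessage` and the project's `detect_lang`:

```python
from types import SimpleNamespace


def detect_lang(text: str) -> str:
    # Trivial stand-in detector, not the MemOS implementation.
    return "zh" if any("\u4e00" <= ch <= "\u9fff" for ch in text) else "en"


def add_lang_to_source(source, content=None):
    """Set source.lang from content (or source.content) only if not already set."""
    if getattr(source, "lang", None) is None:
        text = content or getattr(source, "content", None) or ""
        source.lang = detect_lang(text)
    return source


s1 = SimpleNamespace(content="hello", lang=None)
s2 = SimpleNamespace(content="hello", lang="zh")  # pre-set lang must survive

print(add_lang_to_source(s1).lang)  # detected from content: en
print(add_lang_to_source(s2).lang)  # preserved: zh
```

The idempotence is the point: parsers can layer this helper freely, and whichever component detected the language first wins.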