From e2d64d6ca9480fc97c6eddd43a84f44fb6899d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Wed, 3 Dec 2025 22:56:18 +0800 Subject: [PATCH 1/6] fix: input Pydantic bug --- src/memos/api/product_models.py | 22 +++++++++++----------- src/memos/multi_mem_cube/single_cube.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/memos/api/product_models.py b/src/memos/api/product_models.py index f949f6cb5..e8dd7da4c 100644 --- a/src/memos/api/product_models.py +++ b/src/memos/api/product_models.py @@ -6,7 +6,7 @@ # Import message types from core types module from memos.log import get_logger -from memos.types import MessageList, MessagesType, PermissionDict, SearchMode +from memos.types import PermissionDict, SearchMode logger = get_logger(__name__) @@ -56,7 +56,7 @@ class Message(BaseModel): class MemoryCreate(BaseRequest): user_id: str = Field(..., description="User ID") - messages: MessageList | None = Field(None, description="List of messages to store.") + messages: list | None = Field(None, description="List of messages to store.") memory_content: str | None = Field(None, description="Content to store as memory") doc_path: str | None = Field(None, description="Path to document to store") mem_cube_id: str | None = Field(None, description="ID of the memory cube") @@ -83,7 +83,7 @@ class ChatRequest(BaseRequest): writable_cube_ids: list[str] | None = Field( None, description="List of cube IDs user can write for multi-cube chat" ) - history: MessageList | None = Field(None, description="Chat history") + history: list | None = Field(None, description="Chat history") mode: SearchMode = Field(SearchMode.FAST, description="search mode: fast, fine, or mixture") system_prompt: str | None = Field(None, description="Base system prompt to use for chat") top_k: int = Field(10, description="Number of results to return") @@ -165,7 +165,7 @@ class ChatCompleteRequest(BaseRequest): user_id: str = Field(..., description="User ID") 
query: str = Field(..., description="Chat query message") mem_cube_id: str | None = Field(None, description="Cube ID to use for chat") - history: MessageList | None = Field(None, description="Chat history") + history: list | None = Field(None, description="Chat history") internet_search: bool = Field(False, description="Whether to use internet search") system_prompt: str | None = Field(None, description="Base prompt to use for chat") top_k: int = Field(10, description="Number of results to return") @@ -251,7 +251,7 @@ class MemoryCreateRequest(BaseRequest): """Request model for creating memories.""" user_id: str = Field(..., description="User ID") - messages: str | MessagesType | None = Field(None, description="List of messages to store.") + messages: str | list | None = Field(None, description="List of messages to store.") memory_content: str | None = Field(None, description="Memory content to store") doc_path: str | None = Field(None, description="Path to document to store") mem_cube_id: str | None = Field(None, description="Cube ID") @@ -375,7 +375,7 @@ class APISearchRequest(BaseRequest): ) # ==== Context ==== - chat_history: MessageList | None = Field( + chat_history: list | None = Field( None, description=( "Historical chat messages used internally by algorithms. " @@ -505,7 +505,7 @@ class APIADDRequest(BaseRequest): ) # ==== Input content ==== - messages: MessagesType | None = Field( + messages: list | str | None = Field( None, description=( "List of messages to store. Supports: " @@ -521,7 +521,7 @@ class APIADDRequest(BaseRequest): ) # ==== Chat history ==== - chat_history: MessageList | None = Field( + chat_history: list | None = Field( None, description=( "Historical chat messages used internally by algorithms. 
" @@ -651,7 +651,7 @@ class APIFeedbackRequest(BaseRequest): "default_session", description="Session ID for soft-filtering memories" ) task_id: str | None = Field(None, description="Task ID for monitering async tasks") - history: MessageList | None = Field(..., description="Chat history") + history: list | None = Field(..., description="Chat history") retrieved_memory_ids: list[str] | None = Field( None, description="Retrieved memory ids at last turn" ) @@ -685,7 +685,7 @@ class APIChatCompleteRequest(BaseRequest): writable_cube_ids: list[str] | None = Field( None, description="List of cube IDs user can write for multi-cube chat" ) - history: MessageList | None = Field(None, description="Chat history") + history: list | None = Field(None, description="Chat history") mode: SearchMode = Field(SearchMode.FAST, description="search mode: fast, fine, or mixture") system_prompt: str | None = Field(None, description="Base system prompt to use for chat") top_k: int = Field(10, description="Number of results to return") @@ -754,7 +754,7 @@ class SuggestionRequest(BaseRequest): user_id: str = Field(..., description="User ID") mem_cube_id: str = Field(..., description="Cube ID") language: Literal["zh", "en"] = Field("zh", description="Language for suggestions") - message: MessagesType | None = Field(None, description="List of messages to store.") + message: str | list | None = Field(None, description="List of messages to store.") # ─── MemOS Client Response Models ────────────────────────────────────────────── diff --git a/src/memos/multi_mem_cube/single_cube.py b/src/memos/multi_mem_cube/single_cube.py index 1ddd2b1b7..e8c28c7db 100644 --- a/src/memos/multi_mem_cube/single_cube.py +++ b/src/memos/multi_mem_cube/single_cube.py @@ -555,7 +555,7 @@ def _process_pref_mem( return [] for message in add_req.messages: - if message.get("role", None) is None: + if isinstance(message, dict) and message.get("role", None) is None: return [] target_session_id = add_req.session_id or 
"default_session" From a6461491755b0565c35496fc8f09cf47317d191d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 4 Dec 2025 11:51:21 +0800 Subject: [PATCH 2/6] feat: add image parser --- src/memos/mem_feedback/feedback.py | 2 +- src/memos/mem_reader/multi_modal_struct.py | 4 +- .../mem_reader/read_multi_modal/__init__.py | 3 +- .../read_multi_modal/image_parser.py | 278 +++++++++++++++++- .../read_multi_modal/multi_modal_parser.py | 2 + .../read_multi_modal/user_parser.py | 14 +- .../mem_reader/read_multi_modal/utils.py | 31 ++ src/memos/mem_reader/simple_struct.py | 24 +- src/memos/mem_reader/strategy_struct.py | 3 +- .../textual/prefer_text_memory/extractor.py | 2 +- src/memos/templates/instruction_completion.py | 2 +- src/memos/templates/mem_reader_prompts.py | 58 ++++ 12 files changed, 385 insertions(+), 38 deletions(-) diff --git a/src/memos/mem_feedback/feedback.py b/src/memos/mem_feedback/feedback.py index eed43d66e..49fd382a0 100644 --- a/src/memos/mem_feedback/feedback.py +++ b/src/memos/mem_feedback/feedback.py @@ -17,7 +17,7 @@ from memos.mem_feedback.base import BaseMemFeedback from memos.mem_feedback.utils import should_keep_update, split_into_chunks from memos.mem_reader.factory import MemReaderFactory -from memos.mem_reader.simple_struct import detect_lang +from memos.mem_reader.read_multi_modal import detect_lang from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata from memos.memories.textual.tree_text_memory.organize.manager import ( MemoryManager, diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index e0aa40913..7da013b48 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -7,8 +7,8 @@ from memos import log from memos.configs.mem_reader import MultiModalStructMemReaderConfig from memos.context.context import ContextThreadPoolExecutor -from memos.mem_reader.read_multi_modal 
import MultiModalParser -from memos.mem_reader.simple_struct import SimpleStructMemReader, detect_lang +from memos.mem_reader.read_multi_modal import MultiModalParser, detect_lang +from memos.mem_reader.simple_struct import SimpleStructMemReader from memos.memories.textual.item import TextualMemoryItem from memos.templates.tool_mem_prompts import TOOL_TRAJECTORY_PROMPT_EN, TOOL_TRAJECTORY_PROMPT_ZH from memos.types import MessagesType diff --git a/src/memos/mem_reader/read_multi_modal/__init__.py b/src/memos/mem_reader/read_multi_modal/__init__.py index 3ac074226..925afa3ec 100644 --- a/src/memos/mem_reader/read_multi_modal/__init__.py +++ b/src/memos/mem_reader/read_multi_modal/__init__.py @@ -23,7 +23,7 @@ from .text_content_parser import TextContentParser from .tool_parser import ToolParser from .user_parser import UserParser -from .utils import coerce_scene_data, extract_role +from .utils import coerce_scene_data, detect_lang, extract_role __all__ = [ @@ -38,5 +38,6 @@ "ToolParser", "UserParser", "coerce_scene_data", + "detect_lang", "extract_role", ] diff --git a/src/memos/mem_reader/read_multi_modal/image_parser.py b/src/memos/mem_reader/read_multi_modal/image_parser.py index 610bc122f..88991fbe7 100644 --- a/src/memos/mem_reader/read_multi_modal/image_parser.py +++ b/src/memos/mem_reader/read_multi_modal/image_parser.py @@ -1,14 +1,23 @@ """Parser for image_url content parts.""" +import json +import re + from typing import Any from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM from memos.log import get_logger -from memos.memories.textual.item import SourceMessage, TextualMemoryItem +from memos.memories.textual.item import ( + SourceMessage, + TextualMemoryItem, + TreeNodeTextualMemoryMetadata, +) +from memos.templates.mem_reader_prompts import IMAGE_ANALYSIS_PROMPT_EN, IMAGE_ANALYSIS_PROMPT_ZH from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam -from .base import BaseMessageParser +from 
.base import BaseMessageParser, _derive_key +from .utils import detect_lang logger = get_logger(__name__) @@ -43,7 +52,7 @@ def create_source( detail = "auto" return SourceMessage( type="image", - content=f"[image_url]: {url}", + content=url, original_part=message, url=url, detail=detail, @@ -87,7 +96,262 @@ def parse_fine( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - """Parse image_url in fine mode - placeholder for future vision model integration.""" - # Fine mode processing would use vision models to extract text from images - # For now, return empty list - return [] + """ + Parse image_url in fine mode using vision models to extract information from images. + + Args: + message: Image message to parse + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters (e.g., context_items, custom_tags) + + Returns: + List of TextualMemoryItem objects extracted from the image + """ + if not self.llm: + logger.warning("[ImageParser] LLM not available for fine mode processing") + return [] + + # Extract image information + if not isinstance(message, dict): + logger.warning(f"[ImageParser] Expected dict, got {type(message)}") + return [] + + image_url = message.get("image_url", {}) + if isinstance(image_url, dict): + url = image_url.get("url", "") + detail = image_url.get("detail", "auto") + else: + url = str(image_url) + detail = "auto" + + if not url: + logger.warning("[ImageParser] No image URL found in message") + return [] + + # Create source for this image + source = self.create_source(message, info) + + # Get context items if available + context_items = kwargs.get("context_items") + + # Determine language from context if available + lang = "en" + if context_items: + for item in context_items: + if hasattr(item, "memory") and item.memory: + lang = detect_lang(item.memory) + break + + # Select prompt based on language + image_analysis_prompt = ( + IMAGE_ANALYSIS_PROMPT_ZH if lang == "zh" else IMAGE_ANALYSIS_PROMPT_EN + ) + 
+ # Build messages with image content + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": image_analysis_prompt}, + { + "type": "image_url", + "image_url": { + "url": url, + "detail": detail, + }, + }, + ], + } + ] + + # Add context if available + if context_items: + context_text = "" + for item in context_items: + if hasattr(item, "memory") and item.memory: + context_text += f"{item.memory}\n" + if context_text: + messages.insert( + 0, + { + "role": "system", + "content": f"Context from previous conversation:\n{context_text}", + }, + ) + + try: + # Call LLM with vision model + response_text = self.llm.generate(messages) + if not response_text: + logger.warning("[ImageParser] Empty response from LLM") + return [] + + # Parse JSON response + response_json = self._parse_json_result(response_text) + + # Extract memory items from response + memory_items = [] + memory_list = response_json.get("memory list", []) + + if not memory_list: + logger.warning("[ImageParser] No memory items extracted from image") + # Fallback: create a simple memory item with the summary + summary = response_json.get( + "summary", "Image analyzed but no specific memories extracted." 
+ ) + if summary: + memory_items.append( + self._create_memory_item( + value=summary, + info=info, + memory_type="LongTermMemory", + tags=["image", "visual"], + key=_derive_key(summary), + sources=[source], + background=summary, + ) + ) + return memory_items + + # Create memory items from parsed response + for mem_data in memory_list: + try: + # Normalize memory_type + memory_type = ( + mem_data.get("memory_type", "LongTermMemory") + .replace("长期记忆", "LongTermMemory") + .replace("用户记忆", "UserMemory") + ) + if memory_type not in ["LongTermMemory", "UserMemory"]: + memory_type = "LongTermMemory" + + value = mem_data.get("value", "").strip() + if not value: + continue + + tags = mem_data.get("tags", []) + if not isinstance(tags, list): + tags = [] + # Add image-related tags + if "image" not in [t.lower() for t in tags]: + tags.append("image") + if "visual" not in [t.lower() for t in tags]: + tags.append("visual") + + key = mem_data.get("key", "") + background = response_json.get("summary", "") + + memory_item = self._create_memory_item( + value=value, + info=info, + memory_type=memory_type, + tags=tags, + key=key if key else _derive_key(value), + sources=[source], + background=background, + ) + memory_items.append(memory_item) + except Exception as e: + logger.error(f"[ImageParser] Error creating memory item: {e}") + continue + + return memory_items + + except Exception as e: + logger.error(f"[ImageParser] Error processing image in fine mode: {e}") + # Fallback: create a simple memory item + fallback_value = f"Image analyzed: {url}" + return [ + self._create_memory_item( + value=fallback_value, + info=info, + memory_type="LongTermMemory", + tags=["image", "visual"], + key=_derive_key(fallback_value), + sources=[source], + background="Image processing encountered an error.", + ) + ] + + def _parse_json_result(self, response_text: str) -> dict: + """ + Parse JSON result from LLM response. + Similar to SimpleStructMemReader.parse_json_result. 
+ """ + s = (response_text or "").strip() + + # Try to extract JSON from code blocks + m = re.search(r"```(?:json)?\s*([\s\S]*?)```", s, flags=re.I) + s = (m.group(1) if m else s.replace("```", "")).strip() + + # Find first { + i = s.find("{") + if i == -1: + return {} + s = s[i:].strip() + + try: + return json.loads(s) + except json.JSONDecodeError: + pass + + # Try to find the last } or ] + j = max(s.rfind("}"), s.rfind("]")) + if j != -1: + try: + return json.loads(s[: j + 1]) + except json.JSONDecodeError: + pass + + # Try to close brackets + def _cheap_close(t: str) -> str: + t += "}" * max(0, t.count("{") - t.count("}")) + t += "]" * max(0, t.count("[") - t.count("]")) + return t + + t = _cheap_close(s) + try: + return json.loads(t) + except json.JSONDecodeError as e: + if "Invalid \\escape" in str(e): + s = s.replace("\\", "\\\\") + try: + return json.loads(s) + except json.JSONDecodeError: + pass + logger.error(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}") + return {} + + def _create_memory_item( + self, + value: str, + info: dict[str, Any], + memory_type: str, + tags: list[str], + key: str, + sources: list[SourceMessage], + background: str = "", + ) -> TextualMemoryItem: + """Create a TextualMemoryItem with the given parameters.""" + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + return TextualMemoryItem( + memory=value, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=tags, + key=key, + embedding=self.embedder.embed([value])[0], + usage=[], + sources=sources, + background=background, + confidence=0.99, + type="fact", + info=info_, + ), + ) diff --git a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py index d00639005..a135d7fd2 100644 --- a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py +++ 
b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py @@ -226,6 +226,8 @@ def process_transfer( parser = self.file_content_parser elif source.type == "text": parser = self.text_content_parser + elif source.type in ["image", "image_url"]: + parser = self.image_parser elif source.role: # Chat message, use role parser parser = self.role_parsers.get(source.role) diff --git a/src/memos/mem_reader/read_multi_modal/user_parser.py b/src/memos/mem_reader/read_multi_modal/user_parser.py index 8cf667a4b..c7b8ad4e9 100644 --- a/src/memos/mem_reader/read_multi_modal/user_parser.py +++ b/src/memos/mem_reader/read_multi_modal/user_parser.py @@ -85,8 +85,20 @@ def create_source( original_part=part, ) ) + elif part_type == "image_url": + image_info = part.get("image_url", {}) + sources.append( + SourceMessage( + type="image", + role=role, + chat_time=chat_time, + message_id=message_id, + image_path=image_info.get("url"), + original_part=part, + ) + ) else: - # image_url, input_audio, etc. + # input_audio, etc. sources.append( SourceMessage( type=part_type, diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index 992011765..9582a258c 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -337,3 +337,34 @@ def coerce_scene_data(scene_data: SceneDataInput, scene_type: str) -> list[Messa # fallback return [str(scene_data)] + + +def detect_lang(text): + """ + Detect the language of the given text (Chinese or English). 
+ + Args: + text: Text to analyze + + Returns: + "zh" for Chinese, "en" for English (default) + """ + try: + if not text or not isinstance(text, str): + return "en" + cleaned_text = text + # remove role and timestamp + cleaned_text = re.sub( + r"\b(user|assistant|query|answer)\s*:", "", cleaned_text, flags=re.IGNORECASE + ) + cleaned_text = re.sub(r"\[[\d\-:\s]+\]", "", cleaned_text) + + # extract chinese characters + chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]" + chinese_chars = re.findall(chinese_pattern, cleaned_text) + text_without_special = re.sub(r"[\s\d\W]", "", cleaned_text) + if text_without_special and len(chinese_chars) / len(text_without_special) > 0.3: + return "zh" + return "en" + except Exception: + return "en" diff --git a/src/memos/mem_reader/simple_struct.py b/src/memos/mem_reader/simple_struct.py index 7f7b16234..f43ad01ba 100644 --- a/src/memos/mem_reader/simple_struct.py +++ b/src/memos/mem_reader/simple_struct.py @@ -16,7 +16,7 @@ from memos.embedders.factory import EmbedderFactory from memos.llms.factory import LLMFactory from memos.mem_reader.base import BaseMemReader -from memos.mem_reader.read_multi_modal import coerce_scene_data +from memos.mem_reader.read_multi_modal import coerce_scene_data, detect_lang from memos.memories.textual.item import ( SourceMessage, TextualMemoryItem, @@ -101,28 +101,6 @@ def _count_tokens_text(s: str) -> int: return zh + max(1, rest // 4) -def detect_lang(text): - try: - if not text or not isinstance(text, str): - return "en" - cleaned_text = text - # remove role and timestamp - cleaned_text = re.sub( - r"\b(user|assistant|query|answer)\s*:", "", cleaned_text, flags=re.IGNORECASE - ) - cleaned_text = re.sub(r"\[[\d\-:\s]+\]", "", cleaned_text) - - # extract chinese characters - chinese_pattern = 
r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]" - chinese_chars = re.findall(chinese_pattern, cleaned_text) - text_without_special = re.sub(r"[\s\d\W]", "", cleaned_text) - if text_without_special and len(chinese_chars) / len(text_without_special) > 0.3: - return "zh" - return "en" - except Exception: - return "en" - - def _build_node(idx, message, info, source_info, llm, parse_json_result, embedder): # generate try: diff --git a/src/memos/mem_reader/strategy_struct.py b/src/memos/mem_reader/strategy_struct.py index 21be8bc39..d550d89e9 100644 --- a/src/memos/mem_reader/strategy_struct.py +++ b/src/memos/mem_reader/strategy_struct.py @@ -5,7 +5,8 @@ from memos import log from memos.configs.mem_reader import StrategyStructMemReaderConfig from memos.configs.parser import ParserConfigFactory -from memos.mem_reader.simple_struct import SimpleStructMemReader, detect_lang +from memos.mem_reader.read_multi_modal import detect_lang +from memos.mem_reader.simple_struct import SimpleStructMemReader from memos.parsers.factory import ParserFactory from memos.templates.mem_reader_prompts import ( CUSTOM_TAGS_INSTRUCTION, diff --git a/src/memos/memories/textual/prefer_text_memory/extractor.py b/src/memos/memories/textual/prefer_text_memory/extractor.py index e105500bd..144bfad7f 100644 --- a/src/memos/memories/textual/prefer_text_memory/extractor.py +++ b/src/memos/memories/textual/prefer_text_memory/extractor.py @@ -8,7 +8,7 @@ from memos.context.context import ContextThreadPoolExecutor from memos.log import get_logger -from memos.mem_reader.simple_struct import detect_lang +from memos.mem_reader.read_multi_modal import detect_lang from memos.memories.textual.item import ( PreferenceTextualMemoryMetadata, TextualMemoryItem, diff --git a/src/memos/templates/instruction_completion.py b/src/memos/templates/instruction_completion.py index b88ff474c..74a20ecff 100644 --- 
a/src/memos/templates/instruction_completion.py +++ b/src/memos/templates/instruction_completion.py @@ -1,6 +1,6 @@ from typing import Any -from memos.mem_reader.simple_struct import detect_lang +from memos.mem_reader.read_multi_modal import detect_lang from memos.templates.prefer_complete_prompt import PREF_INSTRUCTIONS, PREF_INSTRUCTIONS_ZH diff --git a/src/memos/templates/mem_reader_prompts.py b/src/memos/templates/mem_reader_prompts.py index 3223e4694..50afb86f2 100644 --- a/src/memos/templates/mem_reader_prompts.py +++ b/src/memos/templates/mem_reader_prompts.py @@ -359,3 +359,61 @@ CUSTOM_TAGS_INSTRUCTION_ZH = """输出tags可以参考下列标签: {custom_tags} 你可以选择与memory相关的在上述列表中可以加入tags,同时你可以根据memory的内容自由添加tags。""" + + +IMAGE_ANALYSIS_PROMPT_EN = """You are an intelligent memory assistant. Analyze the provided image and extract meaningful information that should be remembered. + +Please extract: +1. **Visual Content**: What objects, people, scenes, or text are visible in the image? +2. **Context**: What is the context or situation depicted? +3. **Key Information**: What important details, facts, or information can be extracted? +4. **User Relevance**: What aspects of this image might be relevant to the user's memory? + +Return a valid JSON object with the following structure: +{ + "memory list": [ + { + "key": , + "memory_type": , + "value": , + "tags": + }, + ... + ], + "summary": +} + +Language rules: +- The `key`, `value`, `tags`, `summary` and `memory_type` fields should match the language of the user's context if available, otherwise use English. +- Keep `memory_type` in English. + +Focus on extracting factual, observable information from the image. Avoid speculation unless clearly relevant to user memory.""" + + +IMAGE_ANALYSIS_PROMPT_ZH = """您是一个智能记忆助手。请分析提供的图像并提取应该被记住的有意义信息。 + +请提取: +1. **视觉内容**:图像中可见的物体、人物、场景或文字是什么? +2. **上下文**:图像描绘了什么情境或情况? +3. **关键信息**:可以提取哪些重要的细节、事实或信息? +4. **用户相关性**:图像的哪些方面可能与用户的记忆相关? 
+ +返回一个有效的 JSON 对象,格式如下: +{ + "memory list": [ + { + "key": <字符串,一个唯一且简洁的记忆标题>, + "memory_type": <字符串,"LongTermMemory" 或 "UserMemory">, + "value": <一个详细、自包含的描述,说明应该从图像中记住什么>, + "tags": <相关关键词列表(例如:["图像", "视觉", "场景", "物体"])> + }, + ... + ], + "summary": <一个自然段落,总结图像内容,120-200字> +} + +语言规则: +- `key`、`value`、`tags`、`summary` 和 `memory_type` 字段应该与用户上下文的语言匹配(如果可用),否则使用中文。 +- `memory_type` 保持英文。 + +专注于从图像中提取事实性、可观察的信息。除非与用户记忆明显相关,否则避免推测。""" From 3a0ad614d1133ecd26d2f863dda23be160f77ebf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 4 Dec 2025 13:02:55 +0800 Subject: [PATCH 3/6] feat: back to MessagesType --- src/memos/api/product_models.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/memos/api/product_models.py b/src/memos/api/product_models.py index e8dd7da4c..f949f6cb5 100644 --- a/src/memos/api/product_models.py +++ b/src/memos/api/product_models.py @@ -6,7 +6,7 @@ # Import message types from core types module from memos.log import get_logger -from memos.types import PermissionDict, SearchMode +from memos.types import MessageList, MessagesType, PermissionDict, SearchMode logger = get_logger(__name__) @@ -56,7 +56,7 @@ class Message(BaseModel): class MemoryCreate(BaseRequest): user_id: str = Field(..., description="User ID") - messages: list | None = Field(None, description="List of messages to store.") + messages: MessageList | None = Field(None, description="List of messages to store.") memory_content: str | None = Field(None, description="Content to store as memory") doc_path: str | None = Field(None, description="Path to document to store") mem_cube_id: str | None = Field(None, description="ID of the memory cube") @@ -83,7 +83,7 @@ class ChatRequest(BaseRequest): writable_cube_ids: list[str] | None = Field( None, description="List of cube IDs user can write for multi-cube chat" ) - history: list | None = Field(None, description="Chat history") + history: MessageList | None = 
Field(None, description="Chat history") mode: SearchMode = Field(SearchMode.FAST, description="search mode: fast, fine, or mixture") system_prompt: str | None = Field(None, description="Base system prompt to use for chat") top_k: int = Field(10, description="Number of results to return") @@ -165,7 +165,7 @@ class ChatCompleteRequest(BaseRequest): user_id: str = Field(..., description="User ID") query: str = Field(..., description="Chat query message") mem_cube_id: str | None = Field(None, description="Cube ID to use for chat") - history: list | None = Field(None, description="Chat history") + history: MessageList | None = Field(None, description="Chat history") internet_search: bool = Field(False, description="Whether to use internet search") system_prompt: str | None = Field(None, description="Base prompt to use for chat") top_k: int = Field(10, description="Number of results to return") @@ -251,7 +251,7 @@ class MemoryCreateRequest(BaseRequest): """Request model for creating memories.""" user_id: str = Field(..., description="User ID") - messages: str | list | None = Field(None, description="List of messages to store.") + messages: str | MessagesType | None = Field(None, description="List of messages to store.") memory_content: str | None = Field(None, description="Memory content to store") doc_path: str | None = Field(None, description="Path to document to store") mem_cube_id: str | None = Field(None, description="Cube ID") @@ -375,7 +375,7 @@ class APISearchRequest(BaseRequest): ) # ==== Context ==== - chat_history: list | None = Field( + chat_history: MessageList | None = Field( None, description=( "Historical chat messages used internally by algorithms. " @@ -505,7 +505,7 @@ class APIADDRequest(BaseRequest): ) # ==== Input content ==== - messages: list | str | None = Field( + messages: MessagesType | None = Field( None, description=( "List of messages to store. 
Supports: " @@ -521,7 +521,7 @@ class APIADDRequest(BaseRequest): ) # ==== Chat history ==== - chat_history: list | None = Field( + chat_history: MessageList | None = Field( None, description=( "Historical chat messages used internally by algorithms. " @@ -651,7 +651,7 @@ class APIFeedbackRequest(BaseRequest): "default_session", description="Session ID for soft-filtering memories" ) task_id: str | None = Field(None, description="Task ID for monitering async tasks") - history: list | None = Field(..., description="Chat history") + history: MessageList | None = Field(..., description="Chat history") retrieved_memory_ids: list[str] | None = Field( None, description="Retrieved memory ids at last turn" ) @@ -685,7 +685,7 @@ class APIChatCompleteRequest(BaseRequest): writable_cube_ids: list[str] | None = Field( None, description="List of cube IDs user can write for multi-cube chat" ) - history: list | None = Field(None, description="Chat history") + history: MessageList | None = Field(None, description="Chat history") mode: SearchMode = Field(SearchMode.FAST, description="search mode: fast, fine, or mixture") system_prompt: str | None = Field(None, description="Base system prompt to use for chat") top_k: int = Field(10, description="Number of results to return") @@ -754,7 +754,7 @@ class SuggestionRequest(BaseRequest): user_id: str = Field(..., description="User ID") mem_cube_id: str = Field(..., description="Cube ID") language: Literal["zh", "en"] = Field("zh", description="Language for suggestions") - message: str | list | None = Field(None, description="List of messages to store.") + message: MessagesType | None = Field(None, description="List of messages to store.") # ─── MemOS Client Response Models ────────────────────────────────────────────── From a1b060018795976fe9213d84c53ee35ddbd43e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 4 Dec 2025 14:59:39 +0800 Subject: [PATCH 4/6] fix: other-reader bug --- 
src/memos/configs/mem_reader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/memos/configs/mem_reader.py b/src/memos/configs/mem_reader.py index 9b9bee701..a0b72efd1 100644 --- a/src/memos/configs/mem_reader.py +++ b/src/memos/configs/mem_reader.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import Any, ClassVar -from pydantic import Field, field_validator, model_validator +from pydantic import ConfigDict, Field, field_validator, model_validator from memos.configs.base import BaseConfig from memos.configs.chunker import ChunkerConfigFactory @@ -44,6 +44,8 @@ def parse_datetime(cls, value): class SimpleStructMemReaderConfig(BaseMemReaderConfig): """SimpleStruct MemReader configuration class.""" + model_config = ConfigDict(extra="allow", strict=True) + class MultiModalStructMemReaderConfig(BaseMemReaderConfig): """MultiModalStruct MemReader configuration class.""" @@ -58,6 +60,8 @@ class MultiModalStructMemReaderConfig(BaseMemReaderConfig): class StrategyStructMemReaderConfig(BaseMemReaderConfig): """StrategyStruct MemReader configuration class.""" + model_config = ConfigDict(extra="allow", strict=True) + class MemReaderConfigFactory(BaseConfig): """Factory class for creating MemReader configurations.""" From 5e23d3792fd4797b88ad487c8b813674eb2c45aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 4 Dec 2025 18:38:21 +0800 Subject: [PATCH 5/6] feat: update language detection in string-fine of multi-modal-struct --- src/memos/mem_reader/multi_modal_struct.py | 67 +++++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 7da013b48..ff14e5bd9 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -8,7 +8,7 @@ from memos.configs.mem_reader import MultiModalStructMemReaderConfig from memos.context.context import
ContextThreadPoolExecutor from memos.mem_reader.read_multi_modal import MultiModalParser, detect_lang -from memos.mem_reader.simple_struct import SimpleStructMemReader +from memos.mem_reader.simple_struct import PROMPT_DICT, SimpleStructMemReader from memos.memories.textual.item import TextualMemoryItem from memos.templates.tool_mem_prompts import TOOL_TRAJECTORY_PROMPT_EN, TOOL_TRAJECTORY_PROMPT_ZH from memos.types import MessagesType @@ -248,6 +248,69 @@ def _build_window_from_items( return aggregated_item + def _get_llm_response( + self, mem_str: str, custom_tags: list[str] | None = None, sources: list | None = None + ) -> dict: + """ + Override parent method to improve language detection by using actual text content + from sources instead of JSON-structured memory string. + + Args: + mem_str: Memory string (may contain JSON structures) + custom_tags: Optional custom tags + sources: Optional list of SourceMessage objects to extract text content from + + Returns: + LLM response dictionary + """ + # Try to extract actual text content from sources for better language detection + text_for_lang_detection = mem_str + if sources: + source_texts = [] + for source in sources: + if hasattr(source, "content") and source.content: + source_texts.append(source.content) + elif isinstance(source, dict) and source.get("content"): + source_texts.append(source.get("content")) + + # If we have text content from sources, use it for language detection + if source_texts: + text_for_lang_detection = " ".join(source_texts) + + # Use the extracted text for language detection + lang = detect_lang(text_for_lang_detection) + template = PROMPT_DICT["chat"][lang] + examples = PROMPT_DICT["chat"][f"{lang}_example"] + prompt = template.replace("${conversation}", mem_str) + + custom_tags_prompt = ( + PROMPT_DICT["custom_tags"][lang].replace("{custom_tags}", str(custom_tags)) + if custom_tags + else "" + ) + prompt = prompt.replace("${custom_tags_prompt}", custom_tags_prompt) + + if 
self.config.remove_prompt_example: + prompt = prompt.replace(examples, "") + messages = [{"role": "user", "content": prompt}] + try: + response_text = self.llm.generate(messages) + response_json = self.parse_json_result(response_text) + except Exception as e: + logger.error(f"[LLM] Exception during chat generation: {e}") + response_json = { + "memory list": [ + { + "key": mem_str[:10], + "memory_type": "UserMemory", + "value": mem_str, + "tags": [], + } + ], + "summary": mem_str, + } + return response_json + def _process_string_fine( self, fast_memory_items: list[TextualMemoryItem], @@ -271,7 +334,7 @@ def _process_string_fine( if not isinstance(sources, list): sources = [sources] try: - resp = self._get_llm_response(mem_str, custom_tags) + resp = self._get_llm_response(mem_str, custom_tags, sources) except Exception as e: logger.error(f"[MultiModalFine] Error calling LLM: {e}") continue From 34cc741a5e5eafeb2b62e2e6e4d5d7d58e473f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Thu, 4 Dec 2025 19:40:19 +0800 Subject: [PATCH 6/6] feat: add language detection --- src/memos/mem_reader/multi_modal_struct.py | 53 +++++++++++++++++++--- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index ff14e5bd9..0cb4e1542 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -249,7 +249,11 @@ def _build_window_from_items( return aggregated_item def _get_llm_response( - self, mem_str: str, custom_tags: list[str] | None = None, sources: list | None = None + self, + mem_str: str, + custom_tags: list[str] | None = None, + sources: list | None = None, + prompt_type: str = "chat", ) -> dict: """ Override parent method to improve language detection by using actual text content @@ -259,6 +263,7 @@ def _get_llm_response( mem_str: Memory string (may contain JSON structures) custom_tags: Optional custom tags 
sources: Optional list of SourceMessage objects to extract text content from + prompt_type: Type of prompt to use ("chat" or "doc") Returns: LLM response dictionary @@ -279,18 +284,30 @@ def _get_llm_response( # Use the extracted text for language detection lang = detect_lang(text_for_lang_detection) - template = PROMPT_DICT["chat"][lang] - examples = PROMPT_DICT["chat"][f"{lang}_example"] - prompt = template.replace("${conversation}", mem_str) + + # Select prompt template based on prompt_type + if prompt_type == "doc": + template = PROMPT_DICT["doc"][lang] + examples = "" # doc prompts don't have examples + prompt = template.replace("{chunk_text}", mem_str) + else: + template = PROMPT_DICT["chat"][lang] + examples = PROMPT_DICT["chat"][f"{lang}_example"] + prompt = template.replace("${conversation}", mem_str) custom_tags_prompt = ( PROMPT_DICT["custom_tags"][lang].replace("{custom_tags}", str(custom_tags)) if custom_tags else "" ) - prompt = prompt.replace("${custom_tags_prompt}", custom_tags_prompt) - if self.config.remove_prompt_example: + # Replace custom_tags_prompt placeholder (different for doc vs chat) + if prompt_type == "doc": + prompt = prompt.replace("{custom_tags_prompt}", custom_tags_prompt) + else: + prompt = prompt.replace("${custom_tags_prompt}", custom_tags_prompt) + + if self.config.remove_prompt_example and examples: prompt = prompt.replace(examples, "") messages = [{"role": "user", "content": prompt}] try: @@ -311,6 +328,24 @@ def _get_llm_response( } return response_json + def _determine_prompt_type(self, sources: list) -> str: + """ + Determine prompt type based on sources. 
+ """ + if not sources: + return "chat" + prompt_type = "doc" + for source in sources: + source_role = None + if hasattr(source, "role"): + source_role = source.role + elif isinstance(source, dict): + source_role = source.get("role") + if source_role in {"user", "assistant", "system", "tool"}: + prompt_type = "chat" + + return prompt_type + def _process_string_fine( self, fast_memory_items: list[TextualMemoryItem], @@ -333,8 +368,12 @@ def _process_string_fine( sources = fast_item.metadata.sources or [] if not isinstance(sources, list): sources = [sources] + + # Determine prompt type based on sources + prompt_type = self._determine_prompt_type(sources) + try: - resp = self._get_llm_response(mem_str, custom_tags, sources) + resp = self._get_llm_response(mem_str, custom_tags, sources, prompt_type) except Exception as e: logger.error(f"[MultiModalFine] Error calling LLM: {e}") continue