diff --git a/src/memos/mem_feedback/feedback.py b/src/memos/mem_feedback/feedback.py index eed43d66e..49fd382a0 100644 --- a/src/memos/mem_feedback/feedback.py +++ b/src/memos/mem_feedback/feedback.py @@ -17,7 +17,7 @@ from memos.mem_feedback.base import BaseMemFeedback from memos.mem_feedback.utils import should_keep_update, split_into_chunks from memos.mem_reader.factory import MemReaderFactory -from memos.mem_reader.simple_struct import detect_lang +from memos.mem_reader.read_multi_modal import detect_lang from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata from memos.memories.textual.tree_text_memory.organize.manager import ( MemoryManager, diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index e0aa40913..7da013b48 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -7,8 +7,8 @@ from memos import log from memos.configs.mem_reader import MultiModalStructMemReaderConfig from memos.context.context import ContextThreadPoolExecutor -from memos.mem_reader.read_multi_modal import MultiModalParser -from memos.mem_reader.simple_struct import SimpleStructMemReader, detect_lang +from memos.mem_reader.read_multi_modal import MultiModalParser, detect_lang +from memos.mem_reader.simple_struct import SimpleStructMemReader from memos.memories.textual.item import TextualMemoryItem from memos.templates.tool_mem_prompts import TOOL_TRAJECTORY_PROMPT_EN, TOOL_TRAJECTORY_PROMPT_ZH from memos.types import MessagesType diff --git a/src/memos/mem_reader/read_multi_modal/__init__.py b/src/memos/mem_reader/read_multi_modal/__init__.py index 3ac074226..925afa3ec 100644 --- a/src/memos/mem_reader/read_multi_modal/__init__.py +++ b/src/memos/mem_reader/read_multi_modal/__init__.py @@ -23,7 +23,7 @@ from .text_content_parser import TextContentParser from .tool_parser import ToolParser from .user_parser import UserParser -from .utils import 
coerce_scene_data, extract_role +from .utils import coerce_scene_data, detect_lang, extract_role __all__ = [ @@ -38,5 +38,6 @@ "ToolParser", "UserParser", "coerce_scene_data", + "detect_lang", "extract_role", ] diff --git a/src/memos/mem_reader/read_multi_modal/image_parser.py b/src/memos/mem_reader/read_multi_modal/image_parser.py index 610bc122f..88991fbe7 100644 --- a/src/memos/mem_reader/read_multi_modal/image_parser.py +++ b/src/memos/mem_reader/read_multi_modal/image_parser.py @@ -1,14 +1,23 @@ """Parser for image_url content parts.""" +import json +import re + from typing import Any from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM from memos.log import get_logger -from memos.memories.textual.item import SourceMessage, TextualMemoryItem +from memos.memories.textual.item import ( + SourceMessage, + TextualMemoryItem, + TreeNodeTextualMemoryMetadata, +) +from memos.templates.mem_reader_prompts import IMAGE_ANALYSIS_PROMPT_EN, IMAGE_ANALYSIS_PROMPT_ZH from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam -from .base import BaseMessageParser +from .base import BaseMessageParser, _derive_key +from .utils import detect_lang logger = get_logger(__name__) @@ -43,7 +52,7 @@ def create_source( detail = "auto" return SourceMessage( type="image", - content=f"[image_url]: {url}", + content=url, original_part=message, url=url, detail=detail, @@ -87,7 +96,262 @@ def parse_fine( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - """Parse image_url in fine mode - placeholder for future vision model integration.""" - # Fine mode processing would use vision models to extract text from images - # For now, return empty list - return [] + """ + Parse image_url in fine mode using vision models to extract information from images. 
+ + Args: + message: Image message to parse + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters (e.g., context_items, custom_tags) + + Returns: + List of TextualMemoryItem objects extracted from the image + """ + if not self.llm: + logger.warning("[ImageParser] LLM not available for fine mode processing") + return [] + + # Extract image information + if not isinstance(message, dict): + logger.warning(f"[ImageParser] Expected dict, got {type(message)}") + return [] + + image_url = message.get("image_url", {}) + if isinstance(image_url, dict): + url = image_url.get("url", "") + detail = image_url.get("detail", "auto") + else: + url = str(image_url) + detail = "auto" + + if not url: + logger.warning("[ImageParser] No image URL found in message") + return [] + + # Create source for this image + source = self.create_source(message, info) + + # Get context items if available + context_items = kwargs.get("context_items") + + # Determine language from context if available + lang = "en" + if context_items: + for item in context_items: + if hasattr(item, "memory") and item.memory: + lang = detect_lang(item.memory) + break + + # Select prompt based on language + image_analysis_prompt = ( + IMAGE_ANALYSIS_PROMPT_ZH if lang == "zh" else IMAGE_ANALYSIS_PROMPT_EN + ) + + # Build messages with image content + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": image_analysis_prompt}, + { + "type": "image_url", + "image_url": { + "url": url, + "detail": detail, + }, + }, + ], + } + ] + + # Add context if available + if context_items: + context_text = "" + for item in context_items: + if hasattr(item, "memory") and item.memory: + context_text += f"{item.memory}\n" + if context_text: + messages.insert( + 0, + { + "role": "system", + "content": f"Context from previous conversation:\n{context_text}", + }, + ) + + try: + # Call LLM with vision model + response_text = self.llm.generate(messages) + if not response_text: + 
logger.warning("[ImageParser] Empty response from LLM") + return [] + + # Parse JSON response + response_json = self._parse_json_result(response_text) + + # Extract memory items from response + memory_items = [] + memory_list = response_json.get("memory list", []) + + if not memory_list: + logger.warning("[ImageParser] No memory items extracted from image") + # Fallback: create a simple memory item with the summary + summary = response_json.get( + "summary", "Image analyzed but no specific memories extracted." + ) + if summary: + memory_items.append( + self._create_memory_item( + value=summary, + info=info, + memory_type="LongTermMemory", + tags=["image", "visual"], + key=_derive_key(summary), + sources=[source], + background=summary, + ) + ) + return memory_items + + # Create memory items from parsed response + for mem_data in memory_list: + try: + # Normalize memory_type + memory_type = ( + mem_data.get("memory_type", "LongTermMemory") + .replace("长期记忆", "LongTermMemory") + .replace("用户记忆", "UserMemory") + ) + if memory_type not in ["LongTermMemory", "UserMemory"]: + memory_type = "LongTermMemory" + + value = mem_data.get("value", "").strip() + if not value: + continue + + tags = mem_data.get("tags", []) + if not isinstance(tags, list): + tags = [] + # Add image-related tags + if "image" not in [t.lower() for t in tags]: + tags.append("image") + if "visual" not in [t.lower() for t in tags]: + tags.append("visual") + + key = mem_data.get("key", "") + background = response_json.get("summary", "") + + memory_item = self._create_memory_item( + value=value, + info=info, + memory_type=memory_type, + tags=tags, + key=key if key else _derive_key(value), + sources=[source], + background=background, + ) + memory_items.append(memory_item) + except Exception as e: + logger.error(f"[ImageParser] Error creating memory item: {e}") + continue + + return memory_items + + except Exception as e: + logger.error(f"[ImageParser] Error processing image in fine mode: {e}") + # Fallback: 
create a simple memory item + fallback_value = f"Image analyzed: {url}" + return [ + self._create_memory_item( + value=fallback_value, + info=info, + memory_type="LongTermMemory", + tags=["image", "visual"], + key=_derive_key(fallback_value), + sources=[source], + background="Image processing encountered an error.", + ) + ] + + def _parse_json_result(self, response_text: str) -> dict: + """ + Parse JSON result from LLM response. + Similar to SimpleStructMemReader.parse_json_result. + """ + s = (response_text or "").strip() + + # Try to extract JSON from code blocks + m = re.search(r"```(?:json)?\s*([\s\S]*?)```", s, flags=re.I) + s = (m.group(1) if m else s.replace("```", "")).strip() + + # Find first { + i = s.find("{") + if i == -1: + return {} + s = s[i:].strip() + + try: + return json.loads(s) + except json.JSONDecodeError: + pass + + # Try to find the last } or ] + j = max(s.rfind("}"), s.rfind("]")) + if j != -1: + try: + return json.loads(s[: j + 1]) + except json.JSONDecodeError: + pass + + # Try to close brackets + def _cheap_close(t: str) -> str: + t += "}" * max(0, t.count("{") - t.count("}")) + t += "]" * max(0, t.count("[") - t.count("]")) + return t + + t = _cheap_close(s) + try: + return json.loads(t) + except json.JSONDecodeError as e: + if "Invalid \\escape" in str(e): + s = s.replace("\\", "\\\\") + try: + return json.loads(s) + except json.JSONDecodeError: + pass + logger.error(f"[ImageParser] Failed to parse JSON: {e}\nResponse: {response_text}") + return {} + + def _create_memory_item( + self, + value: str, + info: dict[str, Any], + memory_type: str, + tags: list[str], + key: str, + sources: list[SourceMessage], + background: str = "", + ) -> TextualMemoryItem: + """Create a TextualMemoryItem with the given parameters.""" + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + return TextualMemoryItem( + memory=value, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + 
session_id=session_id, + memory_type=memory_type, + status="activated", + tags=tags, + key=key, + embedding=self.embedder.embed([value])[0], + usage=[], + sources=sources, + background=background, + confidence=0.99, + type="fact", + info=info_, + ), + ) diff --git a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py index d00639005..a135d7fd2 100644 --- a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py +++ b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py @@ -226,6 +226,8 @@ def process_transfer( parser = self.file_content_parser elif source.type == "text": parser = self.text_content_parser + elif source.type in ["image", "image_url"]: + parser = self.image_parser elif source.role: # Chat message, use role parser parser = self.role_parsers.get(source.role) diff --git a/src/memos/mem_reader/read_multi_modal/user_parser.py b/src/memos/mem_reader/read_multi_modal/user_parser.py index 8cf667a4b..c7b8ad4e9 100644 --- a/src/memos/mem_reader/read_multi_modal/user_parser.py +++ b/src/memos/mem_reader/read_multi_modal/user_parser.py @@ -85,8 +85,20 @@ def create_source( original_part=part, ) ) + elif part_type == "image_url": + image_info = part.get("image_url", {}) + sources.append( + SourceMessage( + type="image", + role=role, + chat_time=chat_time, + message_id=message_id, + image_path=image_info.get("url"), + original_part=part, + ) + ) else: - # image_url, input_audio, etc. + # input_audio, etc. 
sources.append( SourceMessage( type=part_type, diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index 992011765..9582a258c 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -337,3 +337,34 @@ def coerce_scene_data(scene_data: SceneDataInput, scene_type: str) -> list[Messa # fallback return [str(scene_data)] + + +def detect_lang(text): + """ + Detect the language of the given text (Chinese or English). + + Args: + text: Text to analyze + + Returns: + "zh" for Chinese, "en" for English (default) + """ + try: + if not text or not isinstance(text, str): + return "en" + cleaned_text = text + # remove role and timestamp + cleaned_text = re.sub( + r"\b(user|assistant|query|answer)\s*:", "", cleaned_text, flags=re.IGNORECASE + ) + cleaned_text = re.sub(r"\[[\d\-:\s]+\]", "", cleaned_text) + + # extract chinese characters + chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]" + chinese_chars = re.findall(chinese_pattern, cleaned_text) + text_without_special = re.sub(r"[\s\d\W]", "", cleaned_text) + if text_without_special and len(chinese_chars) / len(text_without_special) > 0.3: + return "zh" + return "en" + except Exception: + return "en" diff --git a/src/memos/mem_reader/simple_struct.py b/src/memos/mem_reader/simple_struct.py index 7f7b16234..f43ad01ba 100644 --- a/src/memos/mem_reader/simple_struct.py +++ b/src/memos/mem_reader/simple_struct.py @@ -16,7 +16,7 @@ from memos.embedders.factory import EmbedderFactory from memos.llms.factory import LLMFactory from memos.mem_reader.base import BaseMemReader -from memos.mem_reader.read_multi_modal import coerce_scene_data +from memos.mem_reader.read_multi_modal import coerce_scene_data, detect_lang from memos.memories.textual.item import ( SourceMessage, TextualMemoryItem, @@ -101,28 +101,6 @@ def 
_count_tokens_text(s: str) -> int: return zh + max(1, rest // 4) -def detect_lang(text): - try: - if not text or not isinstance(text, str): - return "en" - cleaned_text = text - # remove role and timestamp - cleaned_text = re.sub( - r"\b(user|assistant|query|answer)\s*:", "", cleaned_text, flags=re.IGNORECASE - ) - cleaned_text = re.sub(r"\[[\d\-:\s]+\]", "", cleaned_text) - - # extract chinese characters - chinese_pattern = r"[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f\U0002b820-\U0002ceaf\uf900-\ufaff]" - chinese_chars = re.findall(chinese_pattern, cleaned_text) - text_without_special = re.sub(r"[\s\d\W]", "", cleaned_text) - if text_without_special and len(chinese_chars) / len(text_without_special) > 0.3: - return "zh" - return "en" - except Exception: - return "en" - - def _build_node(idx, message, info, source_info, llm, parse_json_result, embedder): # generate try: diff --git a/src/memos/mem_reader/strategy_struct.py b/src/memos/mem_reader/strategy_struct.py index 21be8bc39..d550d89e9 100644 --- a/src/memos/mem_reader/strategy_struct.py +++ b/src/memos/mem_reader/strategy_struct.py @@ -5,7 +5,8 @@ from memos import log from memos.configs.mem_reader import StrategyStructMemReaderConfig from memos.configs.parser import ParserConfigFactory -from memos.mem_reader.simple_struct import SimpleStructMemReader, detect_lang +from memos.mem_reader.read_multi_modal import detect_lang +from memos.mem_reader.simple_struct import SimpleStructMemReader from memos.parsers.factory import ParserFactory from memos.templates.mem_reader_prompts import ( CUSTOM_TAGS_INSTRUCTION, diff --git a/src/memos/memories/textual/prefer_text_memory/extractor.py b/src/memos/memories/textual/prefer_text_memory/extractor.py index e105500bd..144bfad7f 100644 --- a/src/memos/memories/textual/prefer_text_memory/extractor.py +++ b/src/memos/memories/textual/prefer_text_memory/extractor.py @@ -8,7 +8,7 @@ from memos.context.context import 
ContextThreadPoolExecutor from memos.log import get_logger -from memos.mem_reader.simple_struct import detect_lang +from memos.mem_reader.read_multi_modal import detect_lang from memos.memories.textual.item import ( PreferenceTextualMemoryMetadata, TextualMemoryItem, diff --git a/src/memos/multi_mem_cube/single_cube.py b/src/memos/multi_mem_cube/single_cube.py index f9e084347..f1c01e26e 100644 --- a/src/memos/multi_mem_cube/single_cube.py +++ b/src/memos/multi_mem_cube/single_cube.py @@ -556,7 +556,7 @@ def _process_pref_mem( return [] for message in add_req.messages: - if message.get("role", None) is None: + if isinstance(message, dict) and message.get("role", None) is None: return [] target_session_id = add_req.session_id or "default_session" diff --git a/src/memos/templates/instruction_completion.py b/src/memos/templates/instruction_completion.py index b88ff474c..74a20ecff 100644 --- a/src/memos/templates/instruction_completion.py +++ b/src/memos/templates/instruction_completion.py @@ -1,6 +1,6 @@ from typing import Any -from memos.mem_reader.simple_struct import detect_lang +from memos.mem_reader.read_multi_modal import detect_lang from memos.templates.prefer_complete_prompt import PREF_INSTRUCTIONS, PREF_INSTRUCTIONS_ZH diff --git a/src/memos/templates/mem_reader_prompts.py b/src/memos/templates/mem_reader_prompts.py index 3223e4694..50afb86f2 100644 --- a/src/memos/templates/mem_reader_prompts.py +++ b/src/memos/templates/mem_reader_prompts.py @@ -359,3 +359,61 @@ CUSTOM_TAGS_INSTRUCTION_ZH = """输出tags可以参考下列标签: {custom_tags} 你可以选择与memory相关的在上述列表中可以加入tags,同时你可以根据memory的内容自由添加tags。""" + + +IMAGE_ANALYSIS_PROMPT_EN = """You are an intelligent memory assistant. Analyze the provided image and extract meaningful information that should be remembered. + +Please extract: +1. **Visual Content**: What objects, people, scenes, or text are visible in the image? +2. **Context**: What is the context or situation depicted? +3. 
**Key Information**: What important details, facts, or information can be extracted? +4. **User Relevance**: What aspects of this image might be relevant to the user's memory? + +Return a valid JSON object with the following structure: +{ + "memory list": [ + { + "key": <string, a unique and concise memory title>, + "memory_type": <string, "LongTermMemory" or "UserMemory">, + "value": <a detailed, self-contained description of what should be remembered from the image>, + "tags": <a list of relevant keywords (e.g., ["image", "visual", "scene", "object"])> + }, + ... + ], + "summary": <a natural paragraph summarizing the image content, 120-200 words> +} + +Language rules: +- The `key`, `value`, `tags`, `summary` and `memory_type` fields should match the language of the user's context if available, otherwise use English. +- Keep `memory_type` in English. + +Focus on extracting factual, observable information from the image. Avoid speculation unless clearly relevant to user memory.""" + + +IMAGE_ANALYSIS_PROMPT_ZH = """您是一个智能记忆助手。请分析提供的图像并提取应该被记住的有意义信息。 + +请提取: +1. **视觉内容**:图像中可见的物体、人物、场景或文字是什么? +2. **上下文**:图像描绘了什么情境或情况? +3. **关键信息**:可以提取哪些重要的细节、事实或信息? +4. **用户相关性**:图像的哪些方面可能与用户的记忆相关? + +返回一个有效的 JSON 对象,格式如下: +{ + "memory list": [ + { + "key": <字符串,一个唯一且简洁的记忆标题>, + "memory_type": <字符串,"LongTermMemory" 或 "UserMemory">, + "value": <一个详细、自包含的描述,说明应该从图像中记住什么>, + "tags": <相关关键词列表(例如:["图像", "视觉", "场景", "物体"])> + }, + ... + ], + "summary": <一个自然段落,总结图像内容,120-200字> +} + +语言规则: +- `key`、`value`、`tags`、`summary` 和 `memory_type` 字段应该与用户上下文的语言匹配(如果可用),否则使用中文。 +- `memory_type` 保持英文。 + +专注于从图像中提取事实性、可观察的信息。除非与用户记忆明显相关,否则避免推测。"""