From 29f64df0291b5f1fbd40d08c290cf81d44318d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 15:04:21 +0800 Subject: [PATCH 1/7] feat: update file_content_parser fine --- .../read_multi_modal/file_content_parser.py | 165 +++++++++++++++++- 1 file changed, 164 insertions(+), 1 deletion(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 12b44eae8..59ab914f7 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -1,6 +1,9 @@ """Parser for file content parts (RawMessageList).""" +import os + from typing import Any +from urllib.parse import urlparse from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM @@ -237,4 +240,164 @@ def parse_fine( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - return [] + """ + Parse file content part in fine mode. + Fine mode downloads and parses file content, especially for URLs. 
+ Handles various file parameter scenarios: + - file_data: URL (http://, https://, or @http://), base64 encoded data, or plain text content + - file_id: ID of an uploaded file + - filename: name of the file + """ + if not isinstance(message, dict): + logger.warning(f"[FileContentParser] Expected dict, got {type(message)}") + return [] + + # Extract file information + file_info = message.get("file", {}) + if not isinstance(file_info, dict): + logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}") + return [] + + # Extract file parameters (all are optional) + file_data = file_info.get("file_data", "") + file_id = file_info.get("file_id", "") + filename = file_info.get("filename", "") + + # Initialize parser if not already set + if not self.parser: + try: + from memos.configs.parser import ParserConfigFactory + + parser_config = ParserConfigFactory.model_validate( + { + "backend": "markitdown", + "config": {}, + } + ) + self.parser = ParserFactory.from_config(parser_config) + except Exception as e: + logger.warning(f"[FileContentParser] Failed to create parser: {e}") + return [] + + parsed_text = "" + temp_file_path = None + + try: + # Priority 1: If file_data is provided, process it + if file_data: + if isinstance(file_data, str): + # Check if it's a URL (supports @http://, http://, https://) + url_str = file_data + if url_str.startswith("@"): + url_str = url_str[1:] # Remove @ prefix if present + + if url_str.startswith(("http://", "https://")): + # Download and parse URL + try: + import requests + + # Parse URL to check hostname + parsed_url = urlparse(url_str) + hostname = parsed_url.hostname or "" + + logger.info(f"[FileContentParser] Downloading file from URL: {url_str}") + response = requests.get(url_str, timeout=30) + response.raise_for_status() + + # Determine filename from URL or use provided filename + if not filename: + filename = os.path.basename(parsed_url.path) or "downloaded_file" + + # Route based on hostname + if hostname == 
"139.196.232.20": + # Special handling for 139.196.232.20: directly use response text as markdown + logger.info( + f"[FileContentParser] Using direct markdown content for {hostname}" + ) + parsed_text = response.text + else: + logger.warning("[FileContentParser] Outer url not implemented now.") + except requests.RequestException as e: + logger.error( + f"[FileContentParser] Failed to download URL {url_str}: {e}" + ) + parsed_text = f"[File URL download failed: {url_str}]" + except Exception as e: + logger.error(f"[FileContentParser] Error parsing downloaded file: {e}") + parsed_text = f"[File parsing error: {e!s}]" + + # Check if it's a local file path + elif os.path.exists(file_data): + logger.info("[FileContentParser] local file not implemented now.") + # Check if it's base64 encoded data + elif file_data.startswith("data:") or ( + len(file_data) > 100 + and all( + c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" + for c in file_data[:100] + ) + ): + logger.info("[FileContentParser] base64 not implemented now.") + # Otherwise treat as plain text + else: + parsed_text = file_data + + # Priority 2: If file_id is provided but no file_data, try to use file_id as path + elif file_id: + logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}") + parsed_text = f"[File ID: {file_id}]: File data not provided" + + # If no content could be parsed, create a placeholder + if not parsed_text: + if filename: + parsed_text = f"[File: {filename}]: File data not provided" + else: + parsed_text = "[File: unknown]: File data not provided" + + except Exception as e: + logger.error(f"[FileContentParser] Error in parse_fine: {e}") + parsed_text = f"[File parsing error: {e!s}]" + + finally: + # Clean up temporary file + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + logger.debug(f"[FileContentParser] Cleaned up temporary file: {temp_file_path}") + except Exception as e: + logger.warning( + 
f"[FileContentParser] Failed to delete temp file {temp_file_path}: {e}" + ) + + # Create source + source = self.create_source(message, info) + + # Extract info fields + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + # For file content parts, default to LongTermMemory + memory_type = "LongTermMemory" + + # Create memory item with parsed content + memory_item = TextualMemoryItem( + memory=parsed_text, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=["mode:fine", "multimodal:file"], + key=_derive_key(parsed_text), + embedding=self.embedder.embed([parsed_text])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + + return [memory_item] From f8551ea91048abcb105da5ffb306da4c4af31d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 15:58:17 +0800 Subject: [PATCH 2/7] feat: add inner host --- .../mem_reader/multimodal_struct_reader.py | 107 +++++++++++++++++- src/memos/api/config.py | 5 + src/memos/configs/mem_reader.py | 6 + src/memos/mem_reader/multi_modal_struct.py | 9 +- .../read_multi_modal/file_content_parser.py | 37 +++++- .../read_multi_modal/multi_modal_parser.py | 8 +- 6 files changed, 165 insertions(+), 7 deletions(-) diff --git a/examples/mem_reader/multimodal_struct_reader.py b/examples/mem_reader/multimodal_struct_reader.py index 20c141828..ec4f58dae 100644 --- a/examples/mem_reader/multimodal_struct_reader.py +++ b/examples/mem_reader/multimodal_struct_reader.py @@ -327,6 +327,102 @@ def get_info(self) -> dict[str, Any]: ] ], ), + TestCase( + name="oss_text_file", + description="User message with text and file", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请阅读这个PDF,总结里面的要点。"}, + { + "type": "file", + "file": { + "file_id": "file_123", + "filename": "report.pdf", + 
"file_data": "@http://139.196.232.20:9090/graph-test/algorithm/2025_11_13/1763043889_1763043782_PM1%E8%BD%A6%E9%97%B4PMT%E9%9D%B4%E5%8E%8B%E8%BE%B9%E5%8E%8B%E5%8E%8B%E5%8A%9B%E6%97%A0%E6%B3%95%E5%BB%BA%E7%AB%8B%E6%95%85%E9%9A%9C%E6%8A%A5%E5%91%8A20240720.md", + }, + }, + ], + "chat_time": "2025-11-24T10:21:00Z", + "message_id": "mm-file-1", + } + ] + ], + ), + TestCase( + name="pure_data_file", + description="User message with text and file", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请阅读这个PDF,总结里面的要点。"}, + { + "type": "file", + "file": { + "file_id": "file_123", + "filename": "report.pdf", + "file_data": "明文记忆是系统与用户对话、操作等交互中动态习得,以及外部提供的、可显式管理的结构化知识形态,通常以文档、提示模板、图结构或用户规则等形式存在。它具备编辑性、可共享性与治理友好性,适合存储需要频繁修改、可审计或多方协同使用的信息。 在 MemOS 中,明文记忆可用于动态生成推理上下文、个性化偏好注入、多代理协作共享等场景,成为连接人类输入与模型认知的关键桥梁。激活记忆是指模型在推理过程中产生的瞬时性认知状态,包括 KV cache、隐藏层激活、注意力权重等中间张量结构。它通常用于维持上下文连续性、对话一致性与行为风格控制。 MemOS 将激活记忆抽象为可调度资源,支持按需唤醒、延迟卸载与结构变换。例如,某些上下文状态可以被压缩为“半结构化记忆片段”用于未来复用,也可以在任务级别转化为参数化模块,支持短期记忆的长期化演进。这一机制为模型行为一致性、风格保持与状态持续性提供了基础。", + }, + }, + ], + "chat_time": "2025-11-24T10:21:00Z", + "message_id": "mm-file-1", + } + ] + ], + ), + TestCase( + name="local_data_file", + description="User message with text and file", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请阅读这个PDF,总结里面的要点。"}, + { + "type": "file", + "file": { + "file_id": "file_123", + "filename": "report.pdf", + "file_data": "./my_local_file/report.pdf", + }, + }, + ], + "chat_time": "2025-11-24T10:21:00Z", + "message_id": "mm-file-1", + } + ] + ], + ), + TestCase( + name="internet_file", + description="User message with text and file", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请阅读这个PDF,总结里面的要点。"}, + { + "type": "file", + "file": { + "file_id": "file_123", + "filename": "report.pdf", + "file_data": "https://upload.wikimedia.org/wikipedia/commons/c/cb/NLC416-16jh004830-88775_%E7%B4%85%E6%A8%93%E5%A4%A2.pdf", + }, + }, + ], + 
"chat_time": "2025-11-24T10:21:00Z", + "message_id": "mm-file-1", + } + ] + ], + ), TestCase( name="multimodal_mixed", description="Mixed multimodal message (text + file + image)", @@ -661,6 +757,12 @@ def get_reader_config() -> dict[str, Any]: }, } + # Get direct markdown hostnames from environment variable + direct_markdown_hostnames = None + env_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "") + if env_hostnames: + direct_markdown_hostnames = [h.strip() for h in env_hostnames.split(",") if h.strip()] + return { "llm": llm_config, "embedder": embedder_config, @@ -673,6 +775,7 @@ def get_reader_config() -> dict[str, Any]: "min_sentences_per_chunk": 1, }, }, + "direct_markdown_hostnames": direct_markdown_hostnames, } @@ -863,13 +966,13 @@ def main(): parser.add_argument( "--example", type=str, - default="all", + default="oss_text_file", help="Test case name, category name, or 'all' to run all cases (default: all)", ) parser.add_argument( "--mode", choices=["fast", "fine"], - default="fast", + default="fine", help="Processing mode: fast (quick) or fine (with LLM) (default: fast)", ) parser.add_argument( diff --git a/src/memos/api/config.py b/src/memos/api/config.py index 535811c42..46d6c903f 100644 --- a/src/memos/api/config.py +++ b/src/memos/api/config.py @@ -707,6 +707,11 @@ def get_product_default_config() -> dict[str, Any]: }, }, "chat_chunker": reader_config, + "direct_markdown_hostnames": [ + h.strip() + for h in os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "").split(",") + if h.strip() + ], }, }, "enable_textual_memory": True, diff --git a/src/memos/configs/mem_reader.py b/src/memos/configs/mem_reader.py index 34693ea68..9b9bee701 100644 --- a/src/memos/configs/mem_reader.py +++ b/src/memos/configs/mem_reader.py @@ -48,6 +48,12 @@ class SimpleStructMemReaderConfig(BaseMemReaderConfig): class MultiModalStructMemReaderConfig(BaseMemReaderConfig): """MultiModalStruct MemReader configuration class.""" + direct_markdown_hostnames: 
list[str] | None = Field( + default=None, + description="List of hostnames that should return markdown directly without parsing. " + "If None, reads from FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES environment variable.", + ) + class StrategyStructMemReaderConfig(BaseMemReaderConfig): """StrategyStruct MemReader configuration class.""" diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py index 5a78208b9..94ffb5afc 100644 --- a/src/memos/mem_reader/multi_modal_struct.py +++ b/src/memos/mem_reader/multi_modal_struct.py @@ -29,7 +29,13 @@ def __init__(self, config: MultiModalStructMemReaderConfig): """ from memos.configs.mem_reader import SimpleStructMemReaderConfig + # Extract direct_markdown_hostnames before converting to SimpleStructMemReaderConfig + direct_markdown_hostnames = getattr(config, "direct_markdown_hostnames", None) + + # Create config_dict excluding direct_markdown_hostnames for SimpleStructMemReaderConfig config_dict = config.model_dump(exclude_none=True) + config_dict.pop("direct_markdown_hostnames", None) + simple_config = SimpleStructMemReaderConfig(**config_dict) super().__init__(simple_config) @@ -38,6 +44,7 @@ def __init__(self, config: MultiModalStructMemReaderConfig): embedder=self.embedder, llm=self.llm, parser=None, + direct_markdown_hostnames=direct_markdown_hostnames, ) def _concat_multi_modal_memories( @@ -271,7 +278,7 @@ def _process_multi_modal_data( sources = fast_item.metadata.sources for source in sources: items = self.multi_modal_parser.process_transfer( - source, context_items=[fast_item], custom_tags=custom_tags + source, context_items=[fast_item], custom_tags=custom_tags, info=info ) fine_memory_items.extend(items) return fine_memory_items diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 59ab914f7..c7a7cfeb8 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ 
b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -1,6 +1,7 @@ """Parser for file content parts (RawMessageList).""" import os +import tempfile from typing import Any from urllib.parse import urlparse @@ -30,6 +31,7 @@ def __init__( embedder: BaseEmbedder, llm: BaseLLM | None = None, parser: Any | None = None, + direct_markdown_hostnames: list[str] | None = None, ): """ Initialize FileContentParser. @@ -38,10 +40,26 @@ def __init__( embedder: Embedder for generating embeddings llm: Optional LLM for fine mode processing parser: Optional parser for parsing file contents + direct_markdown_hostnames: List of hostnames that should return markdown directly + without parsing. If None, reads from FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES + environment variable (comma-separated). """ super().__init__(embedder, llm) self.parser = parser + # Get inner markdown hostnames from config or environment + if direct_markdown_hostnames is not None: + self.direct_markdown_hostnames = direct_markdown_hostnames + else: + env_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "") + if env_hostnames: + # Support comma-separated list + self.direct_markdown_hostnames = [ + h.strip() for h in env_hostnames.split(",") if h.strip() + ] + else: + self.direct_markdown_hostnames = [] + def create_source( self, message: File, @@ -309,14 +327,25 @@ def parse_fine( filename = os.path.basename(parsed_url.path) or "downloaded_file" # Route based on hostname - if hostname == "139.196.232.20": - # Special handling for 139.196.232.20: directly use response text as markdown + if hostname in self.direct_markdown_hostnames: + # Special handling for configured hostnames: directly use response text as markdown logger.info( f"[FileContentParser] Using direct markdown content for {hostname}" ) parsed_text = response.text else: - logger.warning("[FileContentParser] Outer url not implemented now.") + file_ext = os.path.splitext(filename)[1] or ".tmp" + + with tempfile.NamedTemporaryFile( + 
mode="wb", delete=False, suffix=file_ext + ) as temp_file: + temp_file.write(response.content) + temp_file_path = temp_file.name + logger.info( + f"[FileContentParser] Downloaded file to: {temp_file_path}" + ) + # Parse the downloaded file + parsed_text = self.parser.parse(temp_file_path) except requests.RequestException as e: logger.error( f"[FileContentParser] Failed to download URL {url_str}: {e}" @@ -373,6 +402,8 @@ def parse_fine( source = self.create_source(message, info) # Extract info fields + if not info: + info = {} info_ = info.copy() user_id = info_.pop("user_id", "") session_id = info_.pop("session_id", "") diff --git a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py index 3c60c3143..d00639005 100644 --- a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py +++ b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py @@ -35,6 +35,7 @@ def __init__( embedder: BaseEmbedder, llm: BaseLLM | None = None, parser: Any | None = None, + direct_markdown_hostnames: list[str] | None = None, ): """ Initialize MultiModalParser. @@ -43,6 +44,9 @@ def __init__( embedder: Embedder for generating embeddings llm: Optional LLM for fine mode processing parser: Optional parser for parsing file contents + direct_markdown_hostnames: List of hostnames that should return markdown directly + without parsing. If None, reads from FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES + environment variable (comma-separated). 
Default: ["139.196.232.20"] """ self.embedder = embedder self.llm = llm @@ -55,7 +59,9 @@ def __init__( self.assistant_parser = AssistantParser(embedder, llm) self.tool_parser = ToolParser(embedder, llm) self.text_content_parser = TextContentParser(embedder, llm) - self.file_content_parser = FileContentParser(embedder, llm, parser) + self.file_content_parser = FileContentParser( + embedder, llm, parser, direct_markdown_hostnames=direct_markdown_hostnames + ) self.image_parser = ImageParser(embedder, llm) self.audio_parser = None # future From dccab1f320a94b15ebf49668ba2514dd5fb180a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 16:04:04 +0800 Subject: [PATCH 3/7] feat: add default inner reader ip --- examples/mem_reader/multimodal_struct_reader.py | 2 +- src/memos/api/config.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/mem_reader/multimodal_struct_reader.py b/examples/mem_reader/multimodal_struct_reader.py index ec4f58dae..790b13f85 100644 --- a/examples/mem_reader/multimodal_struct_reader.py +++ b/examples/mem_reader/multimodal_struct_reader.py @@ -759,7 +759,7 @@ def get_reader_config() -> dict[str, Any]: # Get direct markdown hostnames from environment variable direct_markdown_hostnames = None - env_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "") + env_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "139.196.232.20") if env_hostnames: direct_markdown_hostnames = [h.strip() for h in env_hostnames.split(",") if h.strip()] diff --git a/src/memos/api/config.py b/src/memos/api/config.py index 46d6c903f..af0f0473d 100644 --- a/src/memos/api/config.py +++ b/src/memos/api/config.py @@ -709,7 +709,9 @@ def get_product_default_config() -> dict[str, Any]: "chat_chunker": reader_config, "direct_markdown_hostnames": [ h.strip() - for h in os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "").split(",") + for h in os.getenv( + 
"FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "139.196.232.20" + ).split(",") if h.strip() ], }, From 2f35a147c6d3fc36e97139cc5a0c1a89270f746c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 16:17:04 +0800 Subject: [PATCH 4/7] refactor: modify file_content_parser --- .../read_multi_modal/file_content_parser.py | 130 +++++++++--------- 1 file changed, 64 insertions(+), 66 deletions(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index c7a7cfeb8..1d10ef25d 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -4,7 +4,6 @@ import tempfile from typing import Any -from urllib.parse import urlparse from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM @@ -26,6 +25,53 @@ class FileContentParser(BaseMessageParser): """Parser for file content parts.""" + def _handle_url(self, url_str: str, filename: str) -> tuple[str, str | None]: + """Download and parse file from URL.""" + try: + from urllib.parse import urlparse + + import requests + + parsed_url = urlparse(url_str) + hostname = parsed_url.hostname or "" + + response = requests.get(url_str, timeout=30) + response.raise_for_status() + + if not filename: + filename = os.path.basename(parsed_url.path) or "downloaded_file" + + if hostname in self.direct_markdown_hostnames: + return response.text, None + + file_ext = os.path.splitext(filename)[1] or ".tmp" + with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_ext) as temp_file: + temp_file.write(response.content) + return "", temp_file.name + except Exception as e: + logger.error(f"[FileContentParser] URL processing error: {e}") + return f"[File URL download failed: {url_str}]", None + + def _is_base64(self, data: str) -> bool: + """Quick heuristic to check base64-like string.""" + return data.startswith("data:") 
or ( + len(data) > 100 + and all( + c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" + for c in data[:100] + ) + ) + + def _handle_base64(self, data: str) -> str: + """Base64 not implemented placeholder.""" + logger.info("[FileContentParser] Base64 content detected but decoding is not implemented.") + return "" + + def _handle_local(self, data: str) -> str: + """Base64 not implemented placeholder.""" + logger.info("[FileContentParser] Local file paths are not supported in fine mode.") + return "" + def __init__( self, embedder: BaseEmbedder, @@ -128,8 +174,6 @@ def _parse_file(self, file_info: dict[str, Any]) -> str: return f"[File: {filename}]" try: - import os - if os.path.exists(file_path): parsed_text = self.parser.parse(file_path) return parsed_text @@ -304,73 +348,27 @@ def parse_fine( # Priority 1: If file_data is provided, process it if file_data: if isinstance(file_data, str): - # Check if it's a URL (supports @http://, http://, https://) - url_str = file_data - if url_str.startswith("@"): - url_str = url_str[1:] # Remove @ prefix if present + url_str = file_data[1:] if file_data.startswith("@") else file_data if url_str.startswith(("http://", "https://")): - # Download and parse URL - try: - import requests - - # Parse URL to check hostname - parsed_url = urlparse(url_str) - hostname = parsed_url.hostname or "" - - logger.info(f"[FileContentParser] Downloading file from URL: {url_str}") - response = requests.get(url_str, timeout=30) - response.raise_for_status() - - # Determine filename from URL or use provided filename - if not filename: - filename = os.path.basename(parsed_url.path) or "downloaded_file" - - # Route based on hostname - if hostname in self.direct_markdown_hostnames: - # Special handling for configured hostnames: directly use response text as markdown - logger.info( - f"[FileContentParser] Using direct markdown content for {hostname}" - ) - parsed_text = response.text - else: - file_ext = 
os.path.splitext(filename)[1] or ".tmp" - - with tempfile.NamedTemporaryFile( - mode="wb", delete=False, suffix=file_ext - ) as temp_file: - temp_file.write(response.content) - temp_file_path = temp_file.name - logger.info( - f"[FileContentParser] Downloaded file to: {temp_file_path}" - ) - # Parse the downloaded file + parsed_text, temp_file_path = self._handle_url(url_str, filename) + if temp_file_path: + try: parsed_text = self.parser.parse(temp_file_path) - except requests.RequestException as e: - logger.error( - f"[FileContentParser] Failed to download URL {url_str}: {e}" - ) - parsed_text = f"[File URL download failed: {url_str}]" - except Exception as e: - logger.error(f"[FileContentParser] Error parsing downloaded file: {e}") - parsed_text = f"[File parsing error: {e!s}]" - - # Check if it's a local file path + except Exception as e: + logger.error( + f"[FileContentParser] Error parsing downloaded file: {e}" + ) + parsed_text = f"[File parsing error: {e!s}]" + elif os.path.exists(file_data): - logger.info("[FileContentParser] local file not implemented now.") - # Check if it's base64 encoded data - elif file_data.startswith("data:") or ( - len(file_data) > 100 - and all( - c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" - for c in file_data[:100] - ) - ): - logger.info("[FileContentParser] base64 not implemented now.") - # Otherwise treat as plain text + parsed_text = self._handle_local(file_data) + + elif self._is_base64(file_data): + parsed_text = self._handle_base64(file_data) + else: parsed_text = file_data - # Priority 2: If file_id is provided but no file_data, try to use file_id as path elif file_id: logger.warning(f"[FileContentParser] File data not provided for file_id: {file_id}") @@ -379,9 +377,9 @@ def parse_fine( # If no content could be parsed, create a placeholder if not parsed_text: if filename: - parsed_text = f"[File: {filename}]: File data not provided" + parsed_text = f"[File: {filename}] File data not provided" 
else: - parsed_text = "[File: unknown]: File data not provided" + parsed_text = "[File: unknown] File data not provided" except Exception as e: logger.error(f"[FileContentParser] Error in parse_fine: {e}") From ff4dcdc2df68d77a3ac9ab96c50de50b4ec7d204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 16:20:55 +0800 Subject: [PATCH 5/7] feat: pass through parse when md/txt --- src/memos/mem_reader/read_multi_modal/file_content_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 1d10ef25d..457e4c42d 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -44,7 +44,9 @@ def _handle_url(self, url_str: str, filename: str) -> tuple[str, str | None]: if hostname in self.direct_markdown_hostnames: return response.text, None - file_ext = os.path.splitext(filename)[1] or ".tmp" + file_ext = os.path.splitext(filename)[1].lower() + if file_ext in [".md", ".markdown", ".txt"]: + return response.text, None with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file_ext) as temp_file: temp_file.write(response.content) return "", temp_file.name From c955fd055f03e75aed9eaf9e521bc33c4f19a32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 16:39:17 +0800 Subject: [PATCH 6/7] feat: add text spliter and parser --- .../read_multi_modal/file_content_parser.py | 224 ++++++++++++------ .../mem_reader/read_multi_modal/utils.py | 52 ++++ 2 files changed, 202 insertions(+), 74 deletions(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 457e4c42d..81b5a14c0 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ 
b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -13,10 +13,10 @@ TextualMemoryItem, TreeNodeTextualMemoryMetadata, ) -from memos.parsers.factory import ParserFactory from memos.types.openai_chat_completion_types import File from .base import BaseMessageParser, _derive_key +from .utils import file_parser, text_splitter logger = get_logger(__name__) @@ -108,6 +108,32 @@ def __init__( else: self.direct_markdown_hostnames = [] + def _split_text(self, text: str) -> list[str]: + """ + Split text into chunks using langchain text splitter from utils. + + Args: + text: Text to split + + Returns: + List of text chunks + """ + if not text or not text.strip(): + return [] + + if not text_splitter: + # If text splitter is not available, return text as single chunk + return [text] if text.strip() else [] + + try: + chunks = text_splitter.split_text(text) + logger.debug(f"[FileContentParser] Split text into {len(chunks)} chunks") + return chunks + except Exception as e: + logger.error(f"[FileContentParser] Error splitting text: {e}") + # Fallback to single chunk + return [text] if text.strip() else [] + def create_source( self, message: File, @@ -152,21 +178,9 @@ def _parse_file(self, file_info: dict[str, Any]) -> str: Returns: Parsed text content """ - if not self.parser: - # Try to create a default parser - try: - from memos.configs.parser import ParserConfigFactory - - parser_config = ParserConfigFactory.model_validate( - { - "backend": "markitdown", - "config": {}, - } - ) - self.parser = ParserFactory.from_config(parser_config) - except Exception as e: - logger.warning(f"[FileContentParser] Failed to create parser: {e}") - return "" + if not file_parser: + logger.warning("[FileContentParser] Parser not available") + return "" file_path = file_info.get("path") or file_info.get("file_id", "") filename = file_info.get("filename", "unknown") @@ -177,7 +191,7 @@ def _parse_file(self, file_info: dict[str, Any]) -> str: try: if os.path.exists(file_path): - 
parsed_text = self.parser.parse(file_path) + parsed_text = file_parser.parse(file_path) return parsed_text else: logger.warning(f"[FileContentParser] File not found: {file_path}") @@ -264,6 +278,9 @@ def parse_fast( # Combine content parts content = " ".join(content_parts) + # Split content into chunks + content_chunks = self._split_text(content) + # Create source source = self.create_source(message, info) @@ -276,27 +293,59 @@ def parse_fast( # (since we don't have role information at this level) memory_type = "LongTermMemory" - # Create memory item - memory_item = TextualMemoryItem( - memory=content, - metadata=TreeNodeTextualMemoryMetadata( - user_id=user_id, - session_id=session_id, - memory_type=memory_type, - status="activated", - tags=["mode:fast", "multimodal:file"], - key=_derive_key(content), - embedding=self.embedder.embed([content])[0], - usage=[], - sources=[source], - background="", - confidence=0.99, - type="fact", - info=info_, - ), - ) + # Create memory items for each chunk + memory_items = [] + for chunk_idx, chunk_text in enumerate(content_chunks): + if not chunk_text.strip(): + continue + + memory_item = TextualMemoryItem( + memory=chunk_text, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=[ + "mode:fast", + "multimodal:file", + f"chunk:{chunk_idx + 1}/{len(content_chunks)}", + ], + key=_derive_key(chunk_text), + embedding=self.embedder.embed([chunk_text])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + memory_items.append(memory_item) + + # If no chunks were created, create a placeholder + if not memory_items: + memory_item = TextualMemoryItem( + memory=content, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=["mode:fast", "multimodal:file"], + key=_derive_key(content), + 
embedding=self.embedder.embed([content])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + memory_items.append(memory_item) - return [memory_item] + return memory_items def parse_fine( self, @@ -326,22 +375,9 @@ def parse_fine( file_data = file_info.get("file_data", "") file_id = file_info.get("file_id", "") filename = file_info.get("filename", "") - - # Initialize parser if not already set - if not self.parser: - try: - from memos.configs.parser import ParserConfigFactory - - parser_config = ParserConfigFactory.model_validate( - { - "backend": "markitdown", - "config": {}, - } - ) - self.parser = ParserFactory.from_config(parser_config) - except Exception as e: - logger.warning(f"[FileContentParser] Failed to create parser: {e}") - return [] + if not file_parser: + logger.warning("[FileContentParser] Parser not available") + return [] parsed_text = "" temp_file_path = None @@ -356,7 +392,12 @@ def parse_fine( parsed_text, temp_file_path = self._handle_url(url_str, filename) if temp_file_path: try: - parsed_text = self.parser.parse(temp_file_path) + # Use parser from utils (singleton) + parser = self.parser or file_parser + if parser: + parsed_text = parser.parse(temp_file_path) + else: + parsed_text = "[File parsing error: Parser not available]" except Exception as e: logger.error( f"[FileContentParser] Error parsing downloaded file: {e}" @@ -411,24 +452,59 @@ def parse_fine( # For file content parts, default to LongTermMemory memory_type = "LongTermMemory" - # Create memory item with parsed content - memory_item = TextualMemoryItem( - memory=parsed_text, - metadata=TreeNodeTextualMemoryMetadata( - user_id=user_id, - session_id=session_id, - memory_type=memory_type, - status="activated", - tags=["mode:fine", "multimodal:file"], - key=_derive_key(parsed_text), - embedding=self.embedder.embed([parsed_text])[0], - usage=[], - sources=[source], - background="", - confidence=0.99, - type="fact", - 
info=info_, - ), - ) + # Split parsed text into chunks + content_chunks = self._split_text(parsed_text) + + # Create memory items for each chunk + memory_items = [] + for chunk_idx, chunk_text in enumerate(content_chunks): + if not chunk_text.strip(): + continue + + memory_item = TextualMemoryItem( + memory=chunk_text, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=[ + "mode:fine", + "multimodal:file", + f"chunk:{chunk_idx + 1}/{len(content_chunks)}", + ], + key=_derive_key(chunk_text), + embedding=self.embedder.embed([chunk_text])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + memory_items.append(memory_item) + + # If no chunks were created, create a placeholder + if not memory_items: + memory_item = TextualMemoryItem( + memory=parsed_text, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=["mode:fine", "multimodal:file"], + key=_derive_key(parsed_text), + embedding=self.embedder.embed([parsed_text])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + memory_items.append(memory_item) - return [memory_item] + return memory_items diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index bb2e77e38..30b030f64 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -43,6 +43,58 @@ re.I, ) +# Default configuration for parser and text splitter +DEFAULT_PARSER_CONFIG = { + "backend": "markitdown", + "config": {}, +} + +DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1000")) +DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200")) + +# Initialize parser instance +file_parser = None +try: + parser_config = 
ParserConfigFactory.model_validate(DEFAULT_PARSER_CONFIG) + file_parser = ParserFactory.from_config(parser_config) + logger.debug("[FileContentParser] Initialized parser instance") +except Exception as e: + logger.error(f"[FileContentParser] Failed to create parser: {e}") + file_parser = None + +# Initialize text splitter instance +text_splitter = None +try: + try: + from langchain.text_splitter import RecursiveCharacterTextSplitter + except ImportError: + try: + from langchain_text_splitters import RecursiveCharacterTextSplitter + except ImportError: + logger.error( + "langchain not available. Install with: pip install langchain or pip install langchain-text-splitters" + ) + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=DEFAULT_CHUNK_SIZE, + chunk_overlap=DEFAULT_CHUNK_OVERLAP, + length_function=len, + separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""], + ) + logger.debug( + f"[FileContentParser] Initialized text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, " + f"chunk_overlap={DEFAULT_CHUNK_OVERLAP}" + ) +except ImportError as e: + logger.warning( + f"[FileContentParser] langchain not available, text splitting will be disabled: {e}. 
" + "Install with: pip install langchain or pip install langchain-text-splitters" + ) + text_splitter = None +except Exception as e: + logger.error(f"[FileContentParser] Failed to initialize text splitter: {e}") + text_splitter = None + def extract_role(message: dict[str, Any]) -> str: """Extract role from message.""" From 4cef2c3e20633ff3dca6fe4e8467736d08e9d3ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B8=AD=E9=98=B3=E9=98=B3?= Date: Tue, 2 Dec 2025 17:33:21 +0800 Subject: [PATCH 7/7] feat: add default spliter --- .../read_multi_modal/file_content_parser.py | 22 +++-- .../mem_reader/read_multi_modal/utils.py | 99 ++++++++++++++++++- 2 files changed, 109 insertions(+), 12 deletions(-) diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 81b5a14c0..8a08d6a93 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -16,7 +16,7 @@ from memos.types.openai_chat_completion_types import File from .base import BaseMessageParser, _derive_key -from .utils import file_parser, text_splitter +from .utils import get_parser, get_text_splitter logger = get_logger(__name__) @@ -110,7 +110,7 @@ def __init__( def _split_text(self, text: str) -> list[str]: """ - Split text into chunks using langchain text splitter from utils. + Split text into chunks using text splitter from utils. 
Args: text: Text to split @@ -121,12 +121,13 @@ def _split_text(self, text: str) -> list[str]: if not text or not text.strip(): return [] - if not text_splitter: + splitter = get_text_splitter() + if not splitter: # If text splitter is not available, return text as single chunk return [text] if text.strip() else [] try: - chunks = text_splitter.split_text(text) + chunks = splitter.split_text(text) logger.debug(f"[FileContentParser] Split text into {len(chunks)} chunks") return chunks except Exception as e: @@ -178,7 +179,8 @@ def _parse_file(self, file_info: dict[str, Any]) -> str: Returns: Parsed text content """ - if not file_parser: + parser = self.parser or get_parser() + if not parser: logger.warning("[FileContentParser] Parser not available") return "" @@ -191,7 +193,7 @@ def _parse_file(self, file_info: dict[str, Any]) -> str: try: if os.path.exists(file_path): - parsed_text = file_parser.parse(file_path) + parsed_text = parser.parse(file_path) return parsed_text else: logger.warning(f"[FileContentParser] File not found: {file_path}") @@ -375,7 +377,10 @@ def parse_fine( file_data = file_info.get("file_data", "") file_id = file_info.get("file_id", "") filename = file_info.get("filename", "") - if not file_parser: + + # Use parser from utils + parser = self.parser or get_parser() + if not parser: logger.warning("[FileContentParser] Parser not available") return [] @@ -392,8 +397,7 @@ def parse_fine( parsed_text, temp_file_path = self._handle_url(url_str, filename) if temp_file_path: try: - # Use parser from utils (singleton) - parser = self.parser or file_parser + # Use parser from utils if parser: parsed_text = parser.parse(temp_file_path) else: diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index 30b030f64..992011765 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -52,6 +52,49 @@ DEFAULT_CHUNK_SIZE = 
int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1000")) DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200")) + +def _simple_split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]: + """ + Simple text splitter as fallback when langchain is not available. + + Args: + text: Text to split + chunk_size: Maximum size of chunks + chunk_overlap: Overlap between chunks + + Returns: + List of text chunks + """ + if not text or len(text) <= chunk_size: + return [text] if text.strip() else [] + + chunks = [] + start = 0 + text_len = len(text) + + while start < text_len: + # Calculate end position + end = min(start + chunk_size, text_len) + + # If not the last chunk, try to break at a good position + if end < text_len: + # Try to break at newline, sentence end, or space + for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]: + last_sep = text.rfind(separator, start, end) + if last_sep != -1: + end = last_sep + len(separator) + break + + chunk = text[start:end].strip() + if chunk: + chunks.append(chunk) + + # Move start position with overlap + start = max(start + 1, end - chunk_overlap) + + return chunks + + # Initialize parser instance file_parser = None try: @@ -64,6 +107,8 @@ # Initialize text splitter instance text_splitter = None +_use_simple_splitter = False + try: try: from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -82,18 +127,66 @@ separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""], ) logger.debug( - f"[FileContentParser] Initialized text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, " + f"[FileContentParser] Initialized langchain text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, " f"chunk_overlap={DEFAULT_CHUNK_OVERLAP}" ) except ImportError as e: logger.warning( - f"[FileContentParser] langchain not available, text splitting will be disabled: {e}. " + f"[FileContentParser] langchain not available, using simple text splitter as fallback: {e}. 
def get_parser() -> Any:
    """
    Get parser instance.

    Returns:
        The module-level ``file_parser`` object, or None if it could not be
        created at import time.
    """
    return file_parser


class _SimpleTextSplitter:
    """Minimal splitter exposing ``split_text`` on top of ``_simple_split_text``."""

    def __init__(self, chunk_size: int, chunk_overlap: int):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> list[str]:
        """Split ``text`` using the configured chunk size and overlap."""
        return _simple_split_text(text, self.chunk_size, self.chunk_overlap)


def get_text_splitter(chunk_size: int | None = None, chunk_overlap: int | None = None) -> Any:
    """
    Get text splitter instance or a wrapper around the simple splitter.

    The module-level ``text_splitter`` is returned whenever it is set; in
    that case ``chunk_size`` and ``chunk_overlap`` are ignored. Otherwise, if
    the simple-splitter fallback was enabled at import time, a small wrapper
    object with a ``split_text(text)`` method is returned.

    Args:
        chunk_size: Maximum size of chunks for the simple splitter fallback.
            ``None`` or a non-positive value selects ``DEFAULT_CHUNK_SIZE``.
        chunk_overlap: Overlap between chunks for the simple splitter
            fallback. Only ``None`` selects ``DEFAULT_CHUNK_OVERLAP``; an
            explicit ``0`` is honoured and disables overlap.

    Returns:
        The module-level text splitter, a simple splitter wrapper, or None if
        neither is available.
    """
    if text_splitter is not None:
        return text_splitter

    if not _use_simple_splitter:
        return None

    # Compare against None rather than relying on truthiness: `0` is a valid
    # overlap and must not be replaced by the default.
    if chunk_size is None or chunk_size <= 0:
        actual_chunk_size = DEFAULT_CHUNK_SIZE
    else:
        actual_chunk_size = chunk_size
    actual_chunk_overlap = DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    return _SimpleTextSplitter(actual_chunk_size, actual_chunk_overlap)