diff --git a/examples/mem_reader/multimodal_struct_reader.py b/examples/mem_reader/multimodal_struct_reader.py index d132a4170..be9721e21 100644 --- a/examples/mem_reader/multimodal_struct_reader.py +++ b/examples/mem_reader/multimodal_struct_reader.py @@ -1,109 +1,551 @@ +#!/usr/bin/env python3 +""" +MultiModalStructMemReader Example Script + +This script demonstrates various use cases for MultiModalStructMemReader, +including different message types, modes (fast/fine), and output formats. + +Usage: + python multimodal_struct_reader.py --example all + python multimodal_struct_reader.py --example string_message --mode fast + python multimodal_struct_reader.py --example multimodal --format json +""" + import argparse import json import os +import sys import time +from pathlib import Path from typing import Any from dotenv import load_dotenv from memos.configs.mem_reader import MultiModalStructMemReaderConfig from memos.mem_reader.multi_modal_struct import MultiModalStructMemReader -from memos.memories.textual.item import ( - SourceMessage, - TextualMemoryItem, - TreeNodeTextualMemoryMetadata, -) +from memos.memories.textual.item import TextualMemoryItem -# Load environment variables from .env file +# Add src directory to path +src_path = Path(__file__).parent.parent.parent / "src" +sys.path.insert(0, str(src_path)) + +# Load environment variables load_dotenv() -def print_textual_memory_item( - item: TextualMemoryItem, max_memory_length: int = 200, indent: int = 0 -): - """ - Print a TextualMemoryItem in a structured format. +# ============================================================================ +# Test Case Definitions +# ============================================================================ + + +class TestCase: + """Base class for test cases.""" + + def __init__( + self, + name: str, + description: str, + scene_data: Any, + expected_count: dict[str, int] | None = None, + ): + """ + Initialize a test case. 
+ + Args: + name: Test case name + description: Test case description + scene_data: Scene data to test + expected_count: Expected memory count for each mode (optional) + """ + self.name = name + self.description = description + self.scene_data = scene_data + self.expected_count = expected_count or {} + + def get_info(self) -> dict[str, Any]: + """Get info dict for this test case.""" + return { + "user_id": "test_user", + "session_id": f"session_{self.name}", + "test_case": self.name, + } - Args: - item: The TextualMemoryItem to print - max_memory_length: Maximum length of memory content to display - indent: Number of spaces for indentation - """ - indent_str = " " * indent - print(f"{indent_str}{'=' * 80}") - print(f"{indent_str}TextualMemoryItem") - print(f"{indent_str}{'=' * 80}") - print(f"{indent_str}ID: {item.id}") - print( - f"{indent_str}Memory: {item.memory[:max_memory_length]}{'...' if len(item.memory) > max_memory_length else ''}" + +# String message test cases +STRING_MESSAGE_CASES = [ + TestCase( + name="string_simple", + description="Simple string message", + scene_data=["今天心情不错,喝了咖啡。"], + expected_count={"fast": 1, "fine": 1}, # StringParser returns [] in + # fast mode + ), + TestCase( + name="string_multiple", + description="Multiple string messages", + scene_data=[ + "这是第一条消息。", + "这是第二条消息。", + "这是第三条消息。", + ], + ), +] + +# Standard chat message test cases +CHAT_MESSAGE_CASES = [ + TestCase( + name="chat_simple", + description="Simple chat conversation", + scene_data=[ + [ + { + "role": "user", + "content": "Hello, how are you?", + "chat_time": "2025-01-01T10:00:00Z", + }, + { + "role": "assistant", + "content": "I'm doing well, thank you!", + "chat_time": "2025-01-01T10:00:01Z", + }, + ] + ], + ), + TestCase( + name="chat_with_system", + description="Chat with system message", + scene_data=[ + [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a helpful assistant."}], + "chat_time": "2025-01-01T10:00:00Z", + }, + { + "role": 
"user", + "content": "What's the weather?", + "chat_time": "2025-01-01T10:00:01Z", + }, + { + "role": "assistant", + "content": "I don't have access to weather data.", + "chat_time": "2025-01-01T10:00:02Z", + }, + ] + ], + ), + TestCase( + name="chat_long_conversation", + description="Long conversation with multiple turns", + scene_data=[ + [ + { + "role": "user", + "chat_time": "3 May 2025", + "content": "I'm feeling a bit down today.", + }, + { + "role": "assistant", + "chat_time": "3 May 2025", + "content": "I'm sorry to hear that. Do you want to talk about what's been going on?", + }, + { + "role": "user", + "chat_time": "3 May 2025", + "content": "It's just been a tough couple of days.", + }, + { + "role": "assistant", + "chat_time": "3 May 2025", + "content": "It sounds like you're going through a lot right now.", + }, + ] + ], + ), +] + +# Tool-related test cases +TOOL_MESSAGE_CASES = [ + TestCase( + name="tool_assistant_with_calls", + description="Assistant message with tool_calls", + scene_data=[ + [ + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "tool-call-weather-1", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "北京"}', + }, + } + ], + "chat_time": "2025-11-24T10:12:00Z", + "message_id": "assistant-with-call-1", + } + ] + ], + ), + TestCase( + name="tool_with_result", + description="Tool call with result message", + scene_data=[ + [ + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "tool-call-weather-1", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"location": "北京"}', + }, + } + ], + "chat_time": "2025-11-24T10:12:00Z", + }, + { + "role": "tool", + "content": "北京今天天气晴朗,温度25°C,湿度60%。", + "tool_call_id": "tool-call-weather-1", + "chat_time": "2025-11-24T10:12:05Z", + }, + ] + ], + ), + TestCase( + name="tool_custom_format", + description="Custom tool format (tool_description, tool_input, tool_output)", + scene_data=[ + [ + 
{ + "type": "tool_description", + "name": "get_weather", + "description": "获取指定地点的当前天气信息", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "城市名称"}}, + "required": ["location"], + }, + }, + { + "type": "tool_input", + "call_id": "call_123", + "name": "get_weather", + "argument": {"location": "北京"}, + }, + { + "type": "tool_output", + "call_id": "call_123", + "name": "get_weather", + "output": {"weather": "晴朗", "temperature": 25, "humidity": 60}, + }, + ] + ], + ), +] + +# Multimodal message test cases +MULTIMODAL_MESSAGE_CASES = [ + TestCase( + name="multimodal_text_image", + description="User message with text and image", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "帮我看看这张图片大概是什么内容?"}, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/mountain_lake.jpg", + "detail": "high", + }, + }, + ], + "chat_time": "2025-11-24T10:20:00Z", + "message_id": "mm-img-1", + } + ] + ], + ), + TestCase( + name="multimodal_text_file", + description="User message with text and file", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请阅读这个PDF,总结里面的要点。"}, + {"type": "file", "file": {"file_id": "file_123", "filename": "report.pdf"}}, + ], + "chat_time": "2025-11-24T10:21:00Z", + "message_id": "mm-file-1", + } + ] + ], + ), + TestCase( + name="multimodal_mixed", + description="Mixed multimodal message (text + file + image)", + scene_data=[ + [ + { + "role": "user", + "content": [ + {"type": "text", "text": "请同时分析这个报告和图表。"}, + { + "type": "file", + "file": {"file_id": "file_789", "filename": "analysis_report.pdf"}, + }, + { + "type": "image_url", + "image_url": {"url": "https://example.com/chart.png", "detail": "auto"}, + }, + ], + "chat_time": "2025-11-24T10:23:00Z", + "message_id": "mixed-1", + } + ] + ], + ), + TestCase( + name="multimodal_audio", + description="Audio-only message", + scene_data=[ + [ + { + "role": "user", + "content": [ + { 
+ "type": "input_audio", + "input_audio": {"data": "base64_encoded_audio_here", "format": "mp3"}, + } + ], + "chat_time": "2025-11-24T10:22:00Z", + "message_id": "audio-1", + } + ] + ], + ), +] + +# Raw input item test cases +RAW_INPUT_CASES = [ + TestCase( + name="raw_text_items", + description="Pure text input items without dialog context", + scene_data=[ + [ + {"type": "text", "text": "这是一段独立的文本输入,没有明确的对话上下文。"}, + {"type": "text", "text": "它依然会被抽取和写入明文记忆。"}, + ] + ], + ), + TestCase( + name="raw_file_item", + description="Pure file input by file_id", + scene_data=[ + [{"type": "file", "file": {"file_id": "file_uploaded_123", "filename": "document.pdf"}}] + ], + ), + # File parameter test cases - covering all combinations + TestCase( + name="file_only_file_id", + description="File with only file_id parameter", + scene_data=[[{"type": "file", "file": {"file_id": "file_only_id_123"}}]], + ), + TestCase( + name="file_only_filename", + description="File with only filename parameter", + scene_data=[[{"type": "file", "file": {"filename": "document_only.pdf"}}]], + ), + TestCase( + name="file_only_file_data_base64", + description="File with only file_data (base64 encoded)", + scene_data=[ + [ + { + "type": "file", + "file": { + "file_data": "data:application/pdf;base64,JVBERi0xLjQKJdPr6eEKMSAwIG9iago8PAovVHlwZSAvQ2F0YWxvZwovUGFnZXMgMiAwIFIKPj4KZW5kb2JqCjIgMCBvYmoKPDwKL1R5cGUgL1BhZ2VzCi9LaWRzIFszIDAgUl0KL0NvdW50IDEKPD4KZW5kb2JqCjMgMCBvYmoKPDwKL1R5cGUgL1BhZ2UKL1BhcmVudCAyIDAgUgovTWVkaWFCb3ggWzAgMCA2MTIgNzkyXQovUmVzb3VyY2VzIDw8Ci9Gb250IDw8Ci9GMSA0IDAgUgo+Pgo+PgovQ29udGVudHMgNSAwIFIKPj4KZW5kb2JqCjQgMCBvYmoKPDwKL1R5cGUgL0ZvbnQKL1N1YnR5cGUgL1R5cGUxCi9CYXNlRm9udCAvSGVsdmV0aWNhCj4+CmVuZG9iag==" + }, + } + ] + ], + ), + TestCase( + name="file_only_file_data_url", + description="File with only file_data (URL)", + scene_data=[ + [ + { + "type": "file", + "file": {"file_data": "https://example.com/documents/report.pdf"}, + } + ] + ], + ), + TestCase( + 
name="file_only_file_data_text", + description="File with only file_data (plain text content)", + scene_data=[ + [ + { + "type": "file", + "file": { + "file_data": "This is a plain text file content. It contains multiple lines.\nLine 2 of the file.\nLine 3 of the file." + }, + } + ] + ], + ), + TestCase( + name="file_file_data_and_file_id", + description="File with file_data and file_id", + scene_data=[ + [ + { + "type": "file", + "file": { + "file_data": "https://example.com/documents/data.pdf", + "file_id": "file_with_data_123", + }, + } + ] + ], + ), + TestCase( + name="file_file_data_and_filename", + description="File with file_data and filename", + scene_data=[ + [ + { + "type": "file", + "file": { + "file_data": "This is file content with filename.", + "filename": "content_with_name.txt", + }, + } + ] + ], + ), + TestCase( + name="file_file_id_and_filename", + description="File with file_id and filename (existing case)", + scene_data=[ + [{"type": "file", "file": {"file_id": "file_uploaded_123", "filename": "document.pdf"}}] + ], + ), + TestCase( + name="file_all_parameters", + description="File with all parameters (file_data, file_id, filename)", + scene_data=[ + [ + { + "type": "file", + "file": { + "file_data": "https://example.com/documents/complete.pdf", + "file_id": "file_complete_123", + "filename": "complete_document.pdf", + }, + } + ] + ], + ), + TestCase( + name="file_no_parameters", + description="File with no parameters (should return [File: unknown])", + scene_data=[[{"type": "file", "file": {}}]], + ), +] + +# Assistant message test cases +ASSISTANT_MESSAGE_CASES = [ + TestCase( + name="assistant_with_refusal", + description="Assistant message with refusal", + scene_data=[ + [ + { + "role": "assistant", + "content": [{"type": "text", "text": "I can help you with that."}], + "refusal": "I cannot provide information about that topic.", + "chat_time": "2025-11-24T10:30:00Z", + } + ] + ], + ), + TestCase( + name="assistant_with_audio", + 
description="Assistant message with audio", + scene_data=[ + [ + { + "role": "assistant", + "content": "Here's the audio response.", + "audio": {"id": "audio_response_123"}, + "chat_time": "2025-11-24T10:31:00Z", + } + ] + ], + ), +] + +# All test cases organized by category +TEST_CASES = { + "string": STRING_MESSAGE_CASES, + "chat": CHAT_MESSAGE_CASES, + "tool": TOOL_MESSAGE_CASES, + "multimodal": MULTIMODAL_MESSAGE_CASES, + "raw": RAW_INPUT_CASES, + "assistant": ASSISTANT_MESSAGE_CASES, +} + +# Flattened list of all test cases +ALL_TEST_CASES = {case.name: case for cases in TEST_CASES.values() for case in cases} + + +# ============================================================================ +# Utility Functions +# ============================================================================ + + +def print_textual_memory_item(item: TextualMemoryItem, prefix: str = "", max_length: int = 500): + """Print a memory item in a readable format.""" + print(f"{prefix}Memory ID: {item.id}") + print(f"{prefix}Memory Type: {item.metadata.memory_type}") + if item.metadata.tags: + print(f"{prefix}Tags: {item.metadata.tags}") + memory_preview = ( + item.memory[:max_length] + "..." 
if len(item.memory) > max_length else item.memory ) - print(f"{indent_str}Memory Length: {len(item.memory)} characters") - - # Print metadata - if hasattr(item.metadata, "user_id"): - print(f"{indent_str}User ID: {item.metadata.user_id}") - if hasattr(item.metadata, "session_id"): - print(f"{indent_str}Session ID: {item.metadata.session_id}") - if hasattr(item.metadata, "memory_type"): - print(f"{indent_str}Memory Type: {item.metadata.memory_type}") - if hasattr(item.metadata, "type"): - print(f"{indent_str}Type: {item.metadata.type}") - if hasattr(item.metadata, "key") and item.metadata.key: - print(f"{indent_str}Key: {item.metadata.key}") - if hasattr(item.metadata, "tags") and item.metadata.tags: - print(f"{indent_str}Tags: {', '.join(item.metadata.tags)}") - if hasattr(item.metadata, "confidence"): - print(f"{indent_str}Confidence: {item.metadata.confidence}") - if hasattr(item.metadata, "status"): - print(f"{indent_str}Status: {item.metadata.status}") - if hasattr(item.metadata, "background") and item.metadata.background: - bg_preview = ( - item.metadata.background[:100] + "..." 
- if len(item.metadata.background) > 100 - else item.metadata.background - ) - print(f"{indent_str}Background: {bg_preview}") - if hasattr(item.metadata, "sources") and item.metadata.sources: - print(f"{indent_str}Sources ({len(item.metadata.sources)}):") - for i, source in enumerate(item.metadata.sources): - source_info = [] - if hasattr(source, "type"): - source_info.append(f"type={source.type}") - if hasattr(source, "role"): - source_info.append(f"role={source.role}") - if hasattr(source, "doc_path"): - source_info.append(f"doc_path={source.doc_path}") - if hasattr(source, "chat_time"): - source_info.append(f"chat_time={source.chat_time}") - if hasattr(source, "index") and source.index is not None: - source_info.append(f"index={source.index}") - print(f"{indent_str} [{i + 1}] {', '.join(source_info)}") - if hasattr(item.metadata, "created_at"): - print(f"{indent_str}Created At: {item.metadata.created_at}") - if hasattr(item.metadata, "updated_at"): - print(f"{indent_str}Updated At: {item.metadata.updated_at}") - if hasattr(item.metadata, "embedding") and item.metadata.embedding: - print(f"{indent_str}Embedding: [vector of {len(item.metadata.embedding)} dimensions]") - print(f"{indent_str}{'=' * 80}\n") + print(f"{prefix}Memory: {memory_preview}") + if item.metadata.key: + print(f"{prefix}Key: {item.metadata.key}") + if item.metadata.sources: + sources_count = len(item.metadata.sources) if isinstance(item.metadata.sources, list) else 1 + print(f"{prefix}Sources count: {sources_count}") + print() def print_textual_memory_item_json(item: TextualMemoryItem, indent: int = 2): - """ - Print a TextualMemoryItem as formatted JSON. 
- - Args: - item: The TextualMemoryItem to print - indent: JSON indentation level - """ - # Convert to dict and exclude embedding for readability + """Print a memory item as formatted JSON.""" data = item.to_dict() if "metadata" in data and "embedding" in data["metadata"]: embedding = data["metadata"]["embedding"] if embedding: data["metadata"]["embedding"] = f"[vector of {len(embedding)} dimensions]" - print(json.dumps(data, indent=indent, ensure_ascii=False)) @@ -111,9 +553,6 @@ def get_reader_config() -> dict[str, Any]: """ Get reader configuration from environment variables. - Returns a dictionary that can be used to create MultiModalStructMemReaderConfig. - Similar to APIConfig.get_reader_config() in server_router_api.py. - Returns: Configuration dictionary for MultiModalStructMemReaderConfig """ @@ -205,626 +644,263 @@ def get_reader_config() -> dict[str, Any]: } +def count_memories(memory_results: list[list[TextualMemoryItem]]) -> int: + """Count total number of memory items across all scenes.""" + return sum(len(mem_list) for mem_list in memory_results) + + +# ============================================================================ +# Main Functions +# ============================================================================ + + +def run_test_case( + test_case: TestCase, reader: MultiModalStructMemReader, mode: str = "fast", format: str = "text" +): + """ + Run a single test case. 
+ + Args: + test_case: Test case to run + reader: MultiModalStructMemReader instance + mode: Processing mode ("fast" or "fine") + format: Output format ("text" or "json") + """ + print(f"\n{'=' * 80}") + print(f"Test Case: {test_case.name}") + print(f"Description: {test_case.description}") + print(f"Mode: {mode.upper()}") + print(f"{'=' * 80}\n") + + info = test_case.get_info() + start_time = time.time() + + try: + memory_results = reader.get_memory(test_case.scene_data, type="chat", info=info, mode=mode) + elapsed_time = time.time() - start_time + + total_count = count_memories(memory_results) + print(f"✅ Completed in {elapsed_time:.2f}s") + print(f"📊 Generated {total_count} memory items across {len(memory_results)} scenes\n") + + # Check expected count if provided + if test_case.expected_count and mode in test_case.expected_count: + expected = test_case.expected_count[mode] + if total_count == expected: + print(f"✅ Expected count matches: {expected}") + else: + print(f"⚠️ Expected {expected}, got {total_count}") + + # Print sample results + print("\nSample Results:") + print("-" * 80) + for scene_idx, mem_list in enumerate(memory_results[:3]): # Show first 3 scenes + if not mem_list: + continue + print(f"\nScene {scene_idx + 1}:") + for item_idx, item in enumerate(mem_list[:2]): # Show first 2 items per scene + print(f"\n [Item {item_idx + 1}]") + if format == "json": + print_textual_memory_item_json(item, indent=4) + else: + print_textual_memory_item(item, prefix=" ", max_length=300) + + except Exception as e: + print(f"❌ Error: {e}") + import traceback + + traceback.print_exc() + + +def run_all_test_cases(reader: MultiModalStructMemReader, mode: str = "fast", format: str = "text"): + """Run all test cases.""" + print(f"\n{'=' * 80}") + print(f"Running All Test Cases (Mode: {mode.upper()})") + print(f"{'=' * 80}\n") + + total_cases = len(ALL_TEST_CASES) + for idx, (name, test_case) in enumerate(ALL_TEST_CASES.items(), 1): + print(f"\n[{idx}/{total_cases}] 
Running: {name}") + run_test_case(test_case, reader, mode=mode, format=format) + + +def run_category( + category: str, reader: MultiModalStructMemReader, mode: str = "fast", format: str = "text" +): + """Run all test cases in a category.""" + if category not in TEST_CASES: + print(f"❌ Unknown category: {category}") + print(f"Available categories: {', '.join(TEST_CASES.keys())}") + return + + cases = TEST_CASES[category] + print(f"\n{'=' * 80}") + print(f"Running Category: {category.upper()} ({len(cases)} test cases)") + print(f"Mode: {mode.upper()}") + print(f"{'=' * 80}\n") + + for idx, test_case in enumerate(cases, 1): + print(f"\n[{idx}/{len(cases)}] {test_case.name}") + run_test_case(test_case, reader, mode=mode, format=format) + + +def compare_modes(test_case: TestCase, reader: MultiModalStructMemReader, format: str = "text"): + """Compare fast and fine modes for a test case.""" + print(f"\n{'=' * 80}") + print(f"Comparing Fast vs Fine Mode: {test_case.name}") + print(f"{'=' * 80}\n") + + info = test_case.get_info() + + # Fast mode + print("⚡ FAST Mode:") + print("-" * 80) + start_time = time.time() + fast_results = reader.get_memory(test_case.scene_data, type="chat", info=info, mode="fast") + fast_time = time.time() - start_time + fast_count = count_memories(fast_results) + print(f"Time: {fast_time:.2f}s, Items: {fast_count}") + + # Fine mode + print("\n🔄 FINE Mode:") + print("-" * 80) + start_time = time.time() + fine_results = reader.get_memory(test_case.scene_data, type="chat", info=info, mode="fine") + fine_time = time.time() - start_time + fine_count = count_memories(fine_results) + print(f"Time: {fine_time:.2f}s, Items: {fine_count}") + + # Comparison + print("\n📈 Comparison:") + print(f" Fast: {fast_time:.2f}s, {fast_count} items") + print(f" Fine: {fine_time:.2f}s, {fine_count} items") + if fast_time > 0: + print(f" Speed: {fine_time / fast_time:.1f}x difference") + + # Show samples + if format == "text": + print("\n--- Fast Mode Sample (first item) 
---") + if fast_results and fast_results[0]: + print_textual_memory_item(fast_results[0][0], prefix=" ", max_length=300) + + print("\n--- Fine Mode Sample (first item) ---") + if fine_results and fine_results[0]: + print_textual_memory_item(fine_results[0][0], prefix=" ", max_length=300) + + +def list_test_cases(): + """List all available test cases.""" + print("\n" + "=" * 80) + print("Available Test Cases") + print("=" * 80 + "\n") + + for category, cases in TEST_CASES.items(): + print(f"📁 {category.upper()} ({len(cases)} cases):") + for case in cases: + print(f" • {case.name}: {case.description}") + print() + + def main(): - # Parse command line arguments - parser = argparse.ArgumentParser(description="Test Mem-Reader with structured output") + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Test MultiModalStructMemReader with various use cases", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run all test cases in fast mode + python multimodal_struct_reader.py --example all --mode fast + + # Run a specific test case + python multimodal_struct_reader.py --example chat_simple --mode fine + + # Run a category of test cases + python multimodal_struct_reader.py --example multimodal --mode fast + + # Compare fast vs fine mode + python multimodal_struct_reader.py --example chat_simple --compare + + # List all available test cases + python multimodal_struct_reader.py --list + + # Output in JSON format + python multimodal_struct_reader.py --example chat_simple --format json + """, + ) + + parser.add_argument( + "--example", + type=str, + default="all", + help="Test case name, category name, or 'all' to run all cases (default: all)", + ) + parser.add_argument( + "--mode", + choices=["fast", "fine"], + default="fast", + help="Processing mode: fast (quick) or fine (with LLM) (default: fast)", + ) parser.add_argument( "--format", choices=["text", "json"], default="text", - help="Output format: 'text' for 
structured text, 'json' for JSON format (default: text)", + help="Output format: text (readable) or json (structured) (default: text)", + ) + parser.add_argument( + "--compare", + action="store_true", + help="Compare fast and fine modes (only works with specific test case)", + ) + parser.add_argument( + "--list", + action="store_true", + help="List all available test cases and exit", ) parser.add_argument( "--max-memory-length", type=int, - default=200, - help="Maximum length of memory content to display in text format (default: 200)", + default=500, + help="Maximum length of memory content to display (default: 500)", ) - args = parser.parse_args() - # 1. Create Configuration from environment variables or JSON file - # Try to get config from environment variables first - openai_api_key = os.getenv("OPENAI_API_KEY") - if openai_api_key: - # Use environment variables (similar to server_router_api.py) - config_dict = get_reader_config() - reader_config = MultiModalStructMemReaderConfig.model_validate(config_dict) - else: - # Fall back to JSON file - reader_config = MultiModalStructMemReaderConfig.from_json_file( - "examples/data/config/simple_struct_reader_config.json" - ) - reader = MultiModalStructMemReader(reader_config) - - # 2. Define scene data - scene_data = [ - [ - {"role": "user", "chat_time": "3 May 2025", "content": "I'm feeling a bit down today."}, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "I'm sorry to hear that. Do you want to talk about what's been going on?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "It's just been a tough couple of days, you know? Everything feels a bit overwhelming, and I just can't seem to shake it off.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "It sounds like you're going through a lot right now. Sometimes it helps to talk things out. 
Is there something specific that's been weighing on you, or is it more of a general feeling?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "It's a mix, I guess. Work's been really stressful, and on top of that, I've been feeling kinda disconnected from the people around me.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "That can be really draining, especially when you're feeling isolated on top of the stress. Do you think there's something from your past that's contributing to how you're feeling now? Sometimes our emotions are tied to older experiences.", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "Yeah, now that you mention it… I've been thinking a lot about my childhood lately. I didn't have the easiest upbringing. My parents were always busy, and I often felt like I had to figure things out on my own. I remember one winter, I got really sick with a high fever. My parents were both working, and I didn't want to bother them. I just lay in bed, crying by myself, not knowing what to do.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "That must have been very lonely. Growing up without that support can really make you feel like you have to handle everything on your own. How do you think that experience has affected you now?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "Yeah, it was really lonely. I remember spending a lot of time doing things on my own, like drawing, because it was my only way to escape. I used to draw animals, and I created my own little world on paper. It was peaceful in a way, just me and my imagination, with no outside distractions.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "Drawing sounds like a beautiful way to express yourself and find some peace in that solitude. It must have been a very quiet and private world for you. 
Do you think that sense of loneliness from back then has shaped how you connect with others today?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "I think it has. I've always found it hard to let others in. I feel like I have to handle everything on my own. If I rely on someone else, I feel vulnerable, and I start to feel that loneliness again.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "It sounds like this was a protective mechanism that helped you cope with your feelings of loneliness back then. But as you've grown, it seems like this pattern might not be serving you as well anymore. Have you thought about trying to break this cycle, or finding new ways to connect with others?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "I've thought about it, though it's hard. But I also know I can't keep going like this forever. Maybe it's time to seek some help.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "It's a big step, but a positive one. Exploring your emotions and past experiences could really help you understand how they've shaped your present. Therapy could be a great place to start, as it would help you connect with both yourself and others. Would you be open to trying it?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "I'm open to trying. Thank you for encouraging me. It feels like it might be a good starting point for me.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "I'm really glad to hear that. Taking that first step is always difficult, but you've made an important decision. If you need help finding the right therapist, or just want to talk, I'm here for you.", - }, - ], - [ - { - "role": "user", - "chat_time": "3 May 2025", - "content": "This week I've been feeling a bit off, especially when playing tennis. 
My body just doesn't feel right.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "It sounds like you've been having some physical discomfort lately. Where specifically are you feeling unwell? Is there something particular about tennis that's been difficult?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "I just feel a bit fatigued, and every time I play tennis, my movements feel off. I used to play pretty well, but lately, I've been feeling weak and my movements aren't as coordinated as they used to be.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "Physical discomfort can definitely affect performance, especially in a sport like tennis that requires quick reactions and coordination. Have you noticed anything specific that might be causing these changes? Could it be overtraining, or is there another physical issue making you feel off?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "I think it might be due to stress and lack of sleep recently. I just feel drained all the time. Plus, I've been frustrated with my tennis performance, which makes me feel even worse, like I'm stuck in a vicious cycle.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "Stress and poor sleep quality can definitely affect your physical performance, especially in sports. When you're also feeling down about your performance, it's easy to fall into a negative cycle. Have you tried anything to relieve some of the stress, or to improve your sleep?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "I've tried some relaxation techniques, like deep breathing and simple meditation, but it doesn't seem to help much. 
I still can't focus during tennis, and I feel like my mind is distracted by other things.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "Relaxation techniques like deep breathing and meditation can help reduce stress, but sometimes we need a more structured approach to improve both our physical and mental state. For example, you might want to focus on specific aspects of your tennis training, or adjust your rest and recovery time. Have you thought about setting smaller goals for yourself? Like focusing on a specific tennis move each day, rather than expecting perfection right away? That might help you gradually regain confidence.", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "That's a good idea. I think I've been expecting too much from myself, which just makes me more anxious. Maybe I can start with small practice sessions and gradually find my rhythm again.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "Exactly! Setting small and clear goals will help reduce pressure and give you a sense of accomplishment. And don't forget that rest is just as important—taking time to recover will help you feel better physically and mentally. Both physical and mental recovery are key to improving your performance and health.", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "Thanks, I feel like I need to rethink a few things. Maybe I really need to be kinder to myself and take things step by step.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "You're on the right track. Rethinking things and giving yourself space and patience is so important. You've already taken the first step, now just keep moving forward, one step at a time. If you need anything, I'm always here to help.", - }, - ], - [ - { - "role": "user", - "chat_time": "3 May 2025", - "content": "Yesterday at 3 PM I had a meeting with my team about the new project. 
We discussed the timeline and I'm concerned we won't meet the deadline of December 15th.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "I understand your concern about the project timeline. What specific challenges do you think might prevent your team from meeting the December 15th deadline?", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "We have too many requirements to implement in just two weeks. The backend team needs at least until December 10th to finish their part, which leaves almost no time for testing before the deadline.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "That's definitely a tight timeline. Have you considered prioritizing the requirements to focus on the most critical features first? You might also want to discuss with stakeholders about possibly extending the deadline or reducing the scope for this initial release.", - }, - { - "role": "user", - "chat_time": "3 May 2025", - "content": "That's a good point. We have a follow-up meeting scheduled for tomorrow morning at 9:30 AM. I'll suggest we prioritize features and maybe propose pushing the deadline to January 5th next year.", - }, - { - "role": "assistant", - "chat_time": "3 May 2025", - "content": "That sounds like a solid approach. Coming to the meeting with both a prioritization strategy and a specific alternative deadline shows you've thought through the problem carefully. It might also help to bring some data about similar past projects and how long they took to complete, if you have that information available.", - }, - ], - ] + args = parser.parse_args() - print("=== Mem-Reader Fast vs Fine Mode Comparison ===\n") - - # 3. 
Test Fine Mode (default) - print("🔄 Testing FINE mode (default, with LLM processing)...") - start_time = time.time() - fine_memory = reader.get_memory( - scene_data, type="chat", info={"user_id": "user1", "session_id": "session1"}, mode="fine" - ) - fine_time = time.time() - start_time - print(f"✅ Fine mode completed in {fine_time:.2f} seconds") - print(f"📊 Fine mode generated {sum(len(mem_list) for mem_list in fine_memory)} memory items") - - # 4. Test Fast Mode - print("\n⚡ Testing FAST mode (quick processing, no LLM calls)...") - start_time = time.time() - fast_memory = reader.get_memory( - scene_data, type="chat", info={"user_id": "user1", "session_id": "session1"}, mode="fast" - ) - fast_time = time.time() - start_time - print(f"✅ Fast mode completed in {fast_time:.2f} seconds") - print(f"📊 Fast mode generated {sum(len(mem_list) for mem_list in fast_memory)} memory items") - - # 5. Performance Comparison - print("\n📈 Performance Comparison:") - print(f" Fine mode: {fine_time:.2f}s") - print(f" Fast mode: {fast_time:.2f}s") - print(f" Speed improvement: {fine_time / fast_time:.1f}x faster") - - # 6. 
Show sample results from both modes - print("\n🔍 Sample Results Comparison:") - print("\n--- FINE Mode Results (first 3 items) ---") - for i, mem_list in enumerate(fine_memory[:3]): - for j, mem_item in enumerate(mem_list[:2]): # Show first 2 items from each list - print(f"\n[Scene {i}][Item {j}]") - if args.format == "json": - print_textual_memory_item_json(mem_item, indent=2) - else: - print_textual_memory_item( - mem_item, max_memory_length=args.max_memory_length, indent=2 - ) - - print("\n--- FAST Mode Results (first 3 items) ---") - for i, mem_list in enumerate(fast_memory[:3]): - for j, mem_item in enumerate(mem_list[:2]): # Show first 2 items from each list - print(f"\n[Scene {i}][Item {j}]") - if args.format == "json": - print_textual_memory_item_json(mem_item, indent=2) - else: - print_textual_memory_item( - mem_item, max_memory_length=args.max_memory_length, indent=2 - ) - - # 7. Example of transfer fast mode result into fine result - fast_mode_memories = [ - TextualMemoryItem( - id="4553141b-3a33-4548-b779-e677ec797a9f", - memory="user: Nate:Oh cool! I might check that one out some time soon! I do love watching classics.\nassistant: Joanna:Yep, that movie is awesome. I first watched it around 3 years ago. I even went out and got a physical copy!\nuser: Nate:Sounds cool! Have you seen it a lot? sounds like you know the movie well!\nassistant: Joanna:A few times. It's one of my favorites! I really like the idea and the acting.\nuser: Nate:Cool! I'll definitely check it out. Thanks for the recommendation!\nassistant: Joanna:No problem, Nate! 
Let me know if you like it!\n", - metadata=TreeNodeTextualMemoryMetadata( - user_id="nate_test", - session_id="root_session", - status="activated", - type="fact", - key="user: Nate:Oh cool", - confidence=0.9900000095367432, - source=None, - tags=["mode:fast", "lang:en", "role:assistant", "role:user"], - visibility=None, - updated_at="2025-10-16T17:16:30.094877+08:00", - memory_type="LongTermMemory", - sources=[ - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=0, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=1, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=2, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=3, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=4, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=5, - ), - ], - embedding=None, - created_at="2025-10-16T17:16:30.094919+08:00", - usage=[], - background="", - ), - ), - TextualMemoryItem( - id="752e42fa-92b6-491a-a430-6864a7730fba", - memory="user: Nate:It was! How about you? Do you have any hobbies you love?\nassistant: Joanna:Yeah! Besides writing, I also enjoy reading, watching movies, and exploring nature. Anything else you enjoy doing, Nate?\nuser: Nate:Playing video games and watching movies are my main hobbies.\nassistant: Joanna:Cool, Nate! So we both have similar interests. 
What type of movies do you like best?\nuser: Nate:I love action and sci-fi movies, the effects are so cool! What about you, what's your favorite genre?\nassistant: Joanna:I'm all about dramas and romcoms. I love getting immersed in the feelings and plots.\nuser: Nate:Wow, movies can be so powerful! Do you have any recommendations for me?\nassistant: Joanna:Yeah, totally! Have you seen this romantic drama that's all about memory and relationships? It's such a good one.\nuser: Nate:Oh cool! I might check that one out some time soon! I do love watching classics.\nassistant: Joanna:Yep, that movie is awesome. I first watched it around 3 years ago. I even went out and got a physical copy!\n", - metadata=TreeNodeTextualMemoryMetadata( - user_id="nate_test", - session_id="root_session", - status="activated", - type="fact", - key="user: Nate:It was", - confidence=0.9900000095367432, - source=None, - tags=["mode:fast", "lang:en", "role:assistant", "role:user"], - visibility=None, - updated_at="2025-10-16T17:16:30.095726+08:00", - memory_type="LongTermMemory", - sources=[ - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=0, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=1, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=2, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=3, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=4, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - 
content=None, - doc_path=None, - index=5, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=6, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=7, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=8, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=9, - ), - ], - embedding=None, - created_at="2025-10-16T17:16:30.095767+08:00", - usage=[], - background="", - ), - ), - TextualMemoryItem( - id="c9cf448c-deee-43a8-bafd-eb15fde535b2", - memory="user: Nate:Hey Joanna! Long time no see! What's up? Anything fun going on?\nassistant: Joanna:Hey Nate! Long time no see! I've been working on a project lately - it's been pretty cool. What about you - any fun projects or hobbies?\nuser: Nate:Hey Joanna! That's cool! I won my first video game tournament last week - so exciting!\nassistant: Joanna:Wow Nate! Congrats on winning! Tell me more - what game was it?\nuser: Nate:Thanks! it's a team shooter game.\nassistant: Joanna:Wow, great job! What was is called?\nuser: Nate:The game was called Counter-Strike: Global Offensive, and me and my team had a blast to the very end!\nassistant: Joanna:Cool, Nate! Sounds like a fun experience, even if I'm not into games.\nuser: Nate:It was! How about you? Do you have any hobbies you love?\nassistant: Joanna:Yeah! Besides writing, I also enjoy reading, watching movies, and exploring nature. 
Anything else you enjoy doing, Nate?\n", - metadata=TreeNodeTextualMemoryMetadata( - user_id="nate_test", - session_id="root_session", - status="activated", - type="fact", - key="user: Nate:Hey Joanna", - confidence=0.9900000095367432, - source=None, - tags=["mode:fast", "lang:en", "role:assistant", "role:user"], - visibility=None, - updated_at="2025-10-16T17:16:30.098208+08:00", - memory_type="LongTermMemory", - sources=[ - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=0, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=1, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=2, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=3, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=4, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=5, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=6, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=7, - ), - SourceMessage( - type="chat", - role="user", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, - doc_path=None, - index=8, - ), - SourceMessage( - type="chat", - role="assistant", - chat_time="7:31 pm on 21 January, 2022", - message_id=None, - content=None, 
- doc_path=None, - index=9, - ), - ], - embedding=None, - created_at="2025-10-16T17:16:30.098246+08:00", - usage=[], - background="", - ), - ), - ] - fine_memories = reader.fine_transfer_simple_mem(fast_mode_memories, type="chat") - print("\n--- Transfer Mode Results (first 3 items) ---") - for i, mem_list in enumerate(fine_memories[:3]): - for j, mem_item in enumerate(mem_list[:2]): # Show first 2 items from each list - print(f"\n[Scene {i}][Item {j}]") - if args.format == "json": - print_textual_memory_item_json(mem_item, indent=2) - else: - print_textual_memory_item( - mem_item, max_memory_length=args.max_memory_length, indent=2 - ) - - # 7. Example of processing documents (only in fine mode) - print("\n=== Processing Documents (Fine Mode Only) ===") - # Example document paths (you should replace these with actual document paths) - doc_paths = [ - "text1.txt", - "text2.txt", - ] + # List test cases and exit + if args.list: + list_test_cases() + return + # Initialize reader + print("Initializing MultiModalStructMemReader...") try: - # 6. 
Acquiring memories from documents - doc_memory = reader.get_memory( - doc_paths, - "doc", - info={ - "user_id": "1111", - "session_id": "2222", - }, - mode="fine", - ) - total_items = sum(len(mem_list) for mem_list in doc_memory) - print(f"\n📄 Document Memory generated {total_items} items") - - # Print structured document memory items - if doc_memory: - print("\n--- Document Memory Items (first 3) ---") - for i, mem_list in enumerate(doc_memory[:3]): - for j, mem_item in enumerate(mem_list[:3]): # Show first 3 items from each document - print(f"\n[Document {i}][Item {j}]") - if args.format == "json": - print_textual_memory_item_json(mem_item, indent=2) - else: - print_textual_memory_item( - mem_item, max_memory_length=args.max_memory_length, indent=2 - ) + config_dict = get_reader_config() + reader_config = MultiModalStructMemReaderConfig.model_validate(config_dict) + reader = MultiModalStructMemReader(reader_config) + print("✅ Reader initialized\n") except Exception as e: - print(f"⚠️ Document processing failed: {e}") - print(" (This is expected if document files don't exist)") - - print("\n🎯 Summary:") - print(f" • Fast mode: {fast_time:.2f}s - Quick processing, no LLM calls") - print(f" • Fine mode: {fine_time:.2f}s - Full LLM processing for better understanding") - print(" • Use fast mode for: Real-time applications, high-throughput scenarios") - print(" • Use fine mode for: Quality analysis, detailed memory extraction") + print(f"❌ Failed to initialize reader: {e}") + import traceback + + traceback.print_exc() + return + + # Run test cases + if args.example == "all": + run_all_test_cases(reader, mode=args.mode, format=args.format) + elif args.example in ALL_TEST_CASES: + test_case = ALL_TEST_CASES[args.example] + if args.compare: + compare_modes(test_case, reader, format=args.format) + else: + run_test_case(test_case, reader, mode=args.mode, format=args.format) + elif args.example in TEST_CASES: + run_category(args.example, reader, mode=args.mode, 
format=args.format) + else: + print(f"❌ Unknown test case or category: {args.example}") + print("\nAvailable options:") + print(" Categories:", ", ".join(TEST_CASES.keys())) + print(" Test cases:", ", ".join(ALL_TEST_CASES.keys())) + print("\nUse --list to see all available test cases") if __name__ == "__main__": diff --git a/src/memos/api/config.py b/src/memos/api/config.py index 7710409d5..535811c42 100644 --- a/src/memos/api/config.py +++ b/src/memos/api/config.py @@ -426,7 +426,7 @@ def get_embedder_config() -> dict[str, Any]: def get_reader_config() -> dict[str, Any]: """Get reader configuration.""" return { - "backend": os.getenv("MEM_READER_BACKEND", "simple_struct"), + "backend": os.getenv("MEM_READER_BACKEND", "multimodal_struct"), "config": { "chunk_type": os.getenv("MEM_READER_CHAT_CHUNK_TYPE", "default"), "chunk_length": int(os.getenv("MEM_READER_CHAT_CHUNK_TOKEN_SIZE", 1600)), diff --git a/src/memos/mem_reader/read_multi_modal/__init__.py b/src/memos/mem_reader/read_multi_modal/__init__.py index 5659b4a6a..3ac074226 100644 --- a/src/memos/mem_reader/read_multi_modal/__init__.py +++ b/src/memos/mem_reader/read_multi_modal/__init__.py @@ -16,6 +16,7 @@ from .assistant_parser import AssistantParser from .base import BaseMessageParser from .file_content_parser import FileContentParser +from .image_parser import ImageParser from .multi_modal_parser import MultiModalParser from .string_parser import StringParser from .system_parser import SystemParser @@ -29,6 +30,7 @@ "AssistantParser", "BaseMessageParser", "FileContentParser", + "ImageParser", "MultiModalParser", "StringParser", "SystemParser", diff --git a/src/memos/mem_reader/read_multi_modal/assistant_parser.py b/src/memos/mem_reader/read_multi_modal/assistant_parser.py index 8e035bb95..6ab74cbbb 100644 --- a/src/memos/mem_reader/read_multi_modal/assistant_parser.py +++ b/src/memos/mem_reader/read_multi_modal/assistant_parser.py @@ -227,6 +227,10 @@ def parse_fast( # Combine all content parts content = 
" ".join(content_parts) if content_parts else "" + # If content is empty but we have tool_calls, audio, or refusal, still create memory + if not content and not tool_calls and not audio and not refusal: + return [] + parts = [f"{role}: "] if chat_time: parts.append(f"[{chat_time}]: ") diff --git a/src/memos/mem_reader/read_multi_modal/file_content_parser.py b/src/memos/mem_reader/read_multi_modal/file_content_parser.py index 32769d764..12b44eae8 100644 --- a/src/memos/mem_reader/read_multi_modal/file_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/file_content_parser.py @@ -5,11 +5,15 @@ from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM from memos.log import get_logger -from memos.memories.textual.item import SourceMessage, TextualMemoryItem +from memos.memories.textual.item import ( + SourceMessage, + TextualMemoryItem, + TreeNodeTextualMemoryMetadata, +) from memos.parsers.factory import ParserFactory from memos.types.openai_chat_completion_types import File -from .base import BaseMessageParser +from .base import BaseMessageParser, _derive_key logger = get_logger(__name__) @@ -121,7 +125,111 @@ def parse_fast( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - return [] + """ + Parse file content part in fast mode. + + Fast mode extracts file information and creates a memory item without parsing file content. 
+ Handles various file parameter scenarios: + - file_data: base64 encoded data, URL, or plain text content + - file_id: ID of an uploaded file + - filename: name of the file + + Args: + message: File content part to parse (dict with "type": "file" and "file": {...}) + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters + + Returns: + List of TextualMemoryItem objects + """ + if not isinstance(message, dict): + logger.warning(f"[FileContentParser] Expected dict, got {type(message)}") + return [] + + # Extract file information + file_info = message.get("file", {}) + if not isinstance(file_info, dict): + logger.warning(f"[FileContentParser] Expected file dict, got {type(file_info)}") + return [] + + # Extract file parameters (all are optional) + file_data = file_info.get("file_data", "") + file_id = file_info.get("file_id", "") + filename = file_info.get("filename", "") + + # Build content string based on available information + content_parts = [] + + # Priority 1: If file_data is provided, use it (could be base64, URL, or plain text) + if file_data: + # In fast mode, we don't decode base64 or fetch URLs, just record the reference + if isinstance(file_data, str): + # Check if it looks like base64 (starts with data: or is long base64 string) + if file_data.startswith("data:") or ( + len(file_data) > 100 + and all( + c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" + for c in file_data[:100] + ) + ): + content_parts.append(f"[File Data (base64/encoded): {len(file_data)} chars]") + # Check if it looks like a URL + elif file_data.startswith(("http://", "https://", "file://")): + content_parts.append(f"[File URL: {file_data}]") + else: + # TODO: split into multiple memory items + content_parts.append(file_data) + else: + content_parts.append(f"[File Data: {type(file_data).__name__}]") + + # Priority 2: If file_id is provided, reference it + if file_id: + content_parts.append(f"[File ID: {file_id}]") + + # Priority 3: 
If filename is provided, include it +        if filename: +            content_parts.append(f"[Filename: {filename}]") + +        # If no content can be extracted, create a placeholder +        if not content_parts: +            content_parts.append("[File: unknown]") + +        # Combine content parts +        content = " ".join(content_parts) + +        # Create source +        source = self.create_source(message, info) + +        # Extract info fields +        info_ = info.copy() +        user_id = info_.pop("user_id", "") +        session_id = info_.pop("session_id", "") + +        # For file content parts, default to LongTermMemory +        # (since we don't have role information at this level) +        memory_type = "LongTermMemory" + +        # Create memory item +        memory_item = TextualMemoryItem( +            memory=content, +            metadata=TreeNodeTextualMemoryMetadata( +                user_id=user_id, +                session_id=session_id, +                memory_type=memory_type, +                status="activated", +                tags=["mode:fast", "multimodal:file"], +                key=_derive_key(content), +                embedding=self.embedder.embed([content])[0], +                usage=[], +                sources=[source], +                background="", +                confidence=0.99, +                type="fact", +                info=info_, +            ), +        ) + +        return [memory_item] def parse_fine( self, diff --git a/src/memos/mem_reader/read_multi_modal/image_parser.py b/src/memos/mem_reader/read_multi_modal/image_parser.py new file mode 100644 index 000000000..610bc122f --- /dev/null +++ b/src/memos/mem_reader/read_multi_modal/image_parser.py @@ -0,0 +1,93 @@ +"""Parser for image_url content parts.""" + +from typing import Any + +from memos.embedders.base import BaseEmbedder +from memos.llms.base import BaseLLM +from memos.log import get_logger +from memos.memories.textual.item import SourceMessage, TextualMemoryItem +from memos.types.openai_chat_completion_types import ChatCompletionContentPartImageParam + +from .base import BaseMessageParser + + +logger = get_logger(__name__) + + +class ImageParser(BaseMessageParser): + """Parser for image_url content parts.""" + + def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None): + """ + Initialize ImageParser. 
+ + Args: + embedder: Embedder for generating embeddings + llm: Optional LLM for fine mode processing + """ + super().__init__(embedder, llm) + + def create_source( + self, + message: ChatCompletionContentPartImageParam, + info: dict[str, Any], + ) -> SourceMessage: + """Create SourceMessage from image_url content part.""" + if isinstance(message, dict): + image_url = message.get("image_url", {}) + if isinstance(image_url, dict): + url = image_url.get("url", "") + detail = image_url.get("detail", "auto") + else: + url = str(image_url) + detail = "auto" + return SourceMessage( + type="image", + content=f"[image_url]: {url}", + original_part=message, + url=url, + detail=detail, + ) + return SourceMessage(type="image", content=str(message)) + + def rebuild_from_source( + self, + source: SourceMessage, + ) -> ChatCompletionContentPartImageParam: + """Rebuild image_url content part from SourceMessage.""" + # Use original_part if available + if hasattr(source, "original_part") and source.original_part: + return source.original_part + + # Rebuild from source fields + url = getattr(source, "url", "") or (source.content or "").replace("[image_url]: ", "") + detail = getattr(source, "detail", "auto") + return { + "type": "image_url", + "image_url": { + "url": url, + "detail": detail, + }, + } + + def parse_fast( + self, + message: ChatCompletionContentPartImageParam, + info: dict[str, Any], + **kwargs, + ) -> list[TextualMemoryItem]: + """Parse image_url in fast mode - returns empty list as images need fine mode processing.""" + # In fast mode, images are not processed (they need vision models) + # They will be processed in fine mode via process_transfer + return [] + + def parse_fine( + self, + message: ChatCompletionContentPartImageParam, + info: dict[str, Any], + **kwargs, + ) -> list[TextualMemoryItem]: + """Parse image_url in fine mode - placeholder for future vision model integration.""" + # Fine mode processing would use vision models to extract text from images + # 
For now, return empty list + return [] diff --git a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py index f1214ef5b..3c60c3143 100644 --- a/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py +++ b/src/memos/mem_reader/read_multi_modal/multi_modal_parser.py @@ -15,6 +15,7 @@ from .assistant_parser import AssistantParser from .base import BaseMessageParser from .file_content_parser import FileContentParser +from .image_parser import ImageParser from .string_parser import StringParser from .system_parser import SystemParser from .text_content_parser import TextContentParser @@ -55,7 +56,7 @@ def __init__( self.tool_parser = ToolParser(embedder, llm) self.text_content_parser = TextContentParser(embedder, llm) self.file_content_parser = FileContentParser(embedder, llm, parser) - self.image_parser = None # future + self.image_parser = ImageParser(embedder, llm) self.audio_parser = None # future self.role_parsers = { @@ -69,7 +70,12 @@ def __init__( "text": self.text_content_parser, "file": self.file_content_parser, "image": self.image_parser, + "image_url": self.image_parser, # Support both "image" and "image_url" "audio": self.audio_parser, + # Custom tool formats + "tool_description": self.tool_parser, + "tool_input": self.tool_parser, + "tool_output": self.tool_parser, } def _get_parser(self, message: Any) -> BaseMessageParser | None: diff --git a/src/memos/mem_reader/read_multi_modal/string_parser.py b/src/memos/mem_reader/read_multi_modal/string_parser.py index 8d65f5c8a..3d0837425 100644 --- a/src/memos/mem_reader/read_multi_modal/string_parser.py +++ b/src/memos/mem_reader/read_multi_modal/string_parser.py @@ -8,16 +8,25 @@ from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM from memos.log import get_logger -from memos.memories.textual.item import SourceMessage, TextualMemoryItem +from memos.memories.textual.item import ( + SourceMessage, + 
TextualMemoryItem, + TreeNodeTextualMemoryMetadata, +) -from .base import BaseMessageParser +from .base import BaseMessageParser, _derive_key logger = get_logger(__name__) class StringParser(BaseMessageParser): - """Parser for string format messages.""" + """Parser for string format messages. + + Handles simple string messages in both fast and fine modes. + - Fast mode: Directly converts string to memory item + - Fine mode: Uses LLM to extract structured memories from string + """ def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None): """ @@ -44,8 +53,7 @@ def rebuild_from_source( self, source: SourceMessage, ) -> str: - """Rebuild string message from SourceMessage.""" - return source.content or "" + """We only need rebuild from specific multimodal source""" def parse_fast( self, @@ -53,7 +61,61 @@ def parse_fast( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - return [] + """ + Parse string message in fast mode. + + Fast mode directly converts the string to a memory item without LLM processing. + This is equivalent to simple_struct fast mode for string messages. 
+ + Args: + message: String message to parse + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters + + Returns: + List of TextualMemoryItem objects + """ + if not isinstance(message, str): + logger.warning(f"[StringParser] Expected str, got {type(message)}") + return [] + + content = message.strip() + if not content: + return [] + + # Create source + source = self.create_source(message, info) + + # Extract info fields + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + # For string messages, default to LongTermMemory + # (since we don't have role information) + memory_type = "LongTermMemory" + + # Create memory item + memory_item = TextualMemoryItem( + memory=content, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=["mode:fast"], + key=_derive_key(content), + embedding=self.embedder.embed([content])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + + return [memory_item] def parse_fine( self, @@ -61,4 +123,9 @@ def parse_fine( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: + logger.info( + "str memory is inherently a " + "text-only modality. No special multimodal handling" + " is required in fine mode." + ) return [] diff --git a/src/memos/mem_reader/read_multi_modal/text_content_parser.py b/src/memos/mem_reader/read_multi_modal/text_content_parser.py index 051d5ec47..5ff0a76fd 100644 --- a/src/memos/mem_reader/read_multi_modal/text_content_parser.py +++ b/src/memos/mem_reader/read_multi_modal/text_content_parser.py @@ -1,21 +1,34 @@ -"""Parser for text content parts (RawMessageList).""" +"""Parser for text content parts (RawMessageList). + +Handles text content parts in multimodal messages. +Text content parts are typically used in user/assistant messages with multimodal content. 
+""" from typing import Any from memos.embedders.base import BaseEmbedder from memos.llms.base import BaseLLM from memos.log import get_logger -from memos.memories.textual.item import SourceMessage, TextualMemoryItem +from memos.memories.textual.item import ( + SourceMessage, + TextualMemoryItem, + TreeNodeTextualMemoryMetadata, +) from memos.types.openai_chat_completion_types import ChatCompletionContentPartTextParam -from .base import BaseMessageParser +from .base import BaseMessageParser, _derive_key logger = get_logger(__name__) class TextContentParser(BaseMessageParser): - """Parser for text content parts.""" + """Parser for text content parts. + + Handles text content parts in both fast and fine modes. + - Fast mode: Directly converts text content to memory item + - Fine mode: Returns empty list (text content is handled at parent message level) + """ def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None): """ @@ -46,16 +59,7 @@ def rebuild_from_source( self, source: SourceMessage, ) -> ChatCompletionContentPartTextParam: - """Rebuild text content part from SourceMessage.""" - # Use original_part if available - if hasattr(source, "original_part") and source.original_part: - return source.original_part - - # Rebuild from source fields - return { - "type": "text", - "text": source.content or "", - } + """We only need rebuild from specific multimodal source""" def parse_fast( self, @@ -63,7 +67,55 @@ def parse_fast( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - return [] + """ + Parse text content part in fast mode. 
+ """ + if not isinstance(message, dict): + logger.warning(f"[TextContentParser] Expected dict, got {type(message)}") + return [] + + # Extract text content + text = message.get("text", "") + if not isinstance(text, str): + text = str(text) if text is not None else "" + + content = text.strip() + if not content: + return [] + + # Create source + source = self.create_source(message, info) + + # Extract info fields + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + # For text content parts, default to LongTermMemory + # (since we don't have role information at this level) + memory_type = "LongTermMemory" + + # Create memory item + memory_item = TextualMemoryItem( + memory=content, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=["mode:fast"], + key=_derive_key(content), + embedding=self.embedder.embed([content])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + + return [memory_item] def parse_fine( self, @@ -71,4 +123,8 @@ def parse_fine( info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: + logger.info( + "Text content part is inherently a text-only modality. " + "Fine mode processing is handled at the parent message level (user/assistant)." 
+ ) return [] diff --git a/src/memos/mem_reader/read_multi_modal/tool_parser.py b/src/memos/mem_reader/read_multi_modal/tool_parser.py index f7437312d..7a11d931a 100644 --- a/src/memos/mem_reader/read_multi_modal/tool_parser.py +++ b/src/memos/mem_reader/read_multi_modal/tool_parser.py @@ -29,16 +29,52 @@ def __init__(self, embedder: BaseEmbedder, llm: BaseLLM | None = None): def create_source( self, - message: ChatCompletionToolMessageParam, + message: ChatCompletionToolMessageParam | dict[str, Any], info: dict[str, Any], ) -> SourceMessage: - """Create SourceMessage from tool message.""" + """Create SourceMessage from tool message or custom tool format.""" if not isinstance(message, dict): return SourceMessage(type="chat", role="tool") + # Handle custom tool formats (tool_description, tool_input, tool_output) + msg_type = message.get("type", "") + if msg_type == "tool_description": + name = message.get("name", "") + description = message.get("description", "") + parameters = message.get("parameters", {}) + content = f"[tool_description] name={name}, description={description}, parameters={parameters}" + return SourceMessage( + type="tool_description", + content=content, + original_part=message, + ) + elif msg_type == "tool_input": + call_id = message.get("call_id", "") + name = message.get("name", "") + argument = message.get("argument", {}) + content = f"[tool_input] call_id={call_id}, name={name}, argument={argument}" + return SourceMessage( + type="tool_input", + content=content, + message_id=call_id, + original_part=message, + ) + elif msg_type == "tool_output": + call_id = message.get("call_id", "") + name = message.get("name", "") + output = message.get("output", {}) + content = f"[tool_output] call_id={call_id}, name={name}, output={output}" + return SourceMessage( + type="tool_output", + content=content, + message_id=call_id, + original_part=message, + ) + + # Handle standard tool message content = _extract_text_from_content(message.get("content", "")) 
return SourceMessage( - type="chat", + type="tool", role="tool", chat_time=message.get("chat_time"), message_id=message.get("message_id"), @@ -60,11 +96,123 @@ def rebuild_from_source( def parse_fast( self, - message: ChatCompletionToolMessageParam, + message: ChatCompletionToolMessageParam | dict[str, Any], info: dict[str, Any], **kwargs, ) -> list[TextualMemoryItem]: - return super().parse_fast(message, info, **kwargs) + """ + Parse tool message in fast mode. + + Handles both standard tool messages and custom tool formats: + - Standard tool message: role="tool", content, tool_call_id + - Custom formats: tool_description, tool_input, tool_output + + Args: + message: Tool message to parse + info: Dictionary containing user_id and session_id + **kwargs: Additional parameters + + Returns: + List of TextualMemoryItem objects + """ + from memos.memories.textual.item import TreeNodeTextualMemoryMetadata + + from .base import _derive_key + + if not isinstance(message, dict): + logger.warning(f"[ToolParser] Expected dict, got {type(message)}") + return [] + + # Handle custom tool formats (tool_description, tool_input, tool_output) + msg_type = message.get("type", "") + if msg_type in ("tool_description", "tool_input", "tool_output"): + # Create source + source = self.create_source(message, info) + content = source.content or "" + if not content: + return [] + + # Extract info fields + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + # Create memory item + memory_item = TextualMemoryItem( + memory=content, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type="LongTermMemory", + status="activated", + tags=["mode:fast"], + key=_derive_key(content), + embedding=self.embedder.embed([content])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + return [memory_item] + + # Handle standard tool message (role="tool") + 
role = message.get("role", "").strip().lower() + if role != "tool": + logger.warning(f"[ToolParser] Expected role='tool', got role='{role}'") + return [] + + # Extract content from tool message + content = _extract_text_from_content(message.get("content", "")) + if not content: + return [] + + # Build formatted line similar to assistant_parser + tool_call_id = message.get("tool_call_id", "") + chat_time = message.get("chat_time") + + parts = [f"{role}: "] + if chat_time: + parts.append(f"[{chat_time}]: ") + if tool_call_id: + parts.append(f"[tool_call_id: {tool_call_id}]: ") + prefix = "".join(parts) + line = f"{prefix}{content}\n" + + # Create source + source = self.create_source(message, info) + + # Extract info fields + info_ = info.copy() + user_id = info_.pop("user_id", "") + session_id = info_.pop("session_id", "") + + # Tool messages are typically LongTermMemory (they're system/assistant tool results) + memory_type = "LongTermMemory" + + # Create memory item + memory_item = TextualMemoryItem( + memory=line, + metadata=TreeNodeTextualMemoryMetadata( + user_id=user_id, + session_id=session_id, + memory_type=memory_type, + status="activated", + tags=["mode:fast"], + key=_derive_key(line), + embedding=self.embedder.embed([line])[0], + usage=[], + sources=[source], + background="", + confidence=0.99, + type="fact", + info=info_, + ), + ) + + return [memory_item] def parse_fine( self, diff --git a/src/memos/mem_reader/read_multi_modal/utils.py b/src/memos/mem_reader/read_multi_modal/utils.py index c14710650..bb2e77e38 100644 --- a/src/memos/mem_reader/read_multi_modal/utils.py +++ b/src/memos/mem_reader/read_multi_modal/utils.py @@ -93,6 +93,11 @@ def coerce_scene_data(scene_data: SceneDataInput, scene_type: str) -> list[Messa if not items: continue + # Keep string as-is (MessagesType supports str) + if isinstance(items, str): + complete_scene_data.append(items) + continue + # ONLY add chat_time if it's a MessageList if not _is_message_list(items): 
complete_scene_data.append(items) diff --git a/src/memos/memories/textual/item.py b/src/memos/memories/textual/item.py index 12be08057..b7956bfec 100644 --- a/src/memos/memories/textual/item.py +++ b/src/memos/memories/textual/item.py @@ -35,7 +35,7 @@ class SourceMessage(BaseModel): """ type: str | None = "chat" - role: Literal["user", "assistant", "system"] | None = None + role: Literal["user", "assistant", "system", "tool"] | None = None chat_time: str | None = None message_id: str | None = None content: str | None = None